twitterscraper-ruby 0.14.0 → 0.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +25 -16
- data/lib/twitterscraper/cli.rb +3 -0
- data/lib/twitterscraper/query.rb +35 -26
- data/lib/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a950fb24329aaa1020441e258a8a2144100d732142b6c227bb9b026b8bb73996
|
4
|
+
data.tar.gz: 1f64f31e43189e2ee439f5ef6f6d54bc6ea58895adbed67cb8ddbe91af07681a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8573affbc9a5faa05e5e489364bb2ba0da1aa4f12af35445e5de8b1f8c399eb0575cc9f408b2ba96c3d7fd8b2a74b7dd703229053a33c1f8a883856818033cb9
|
7
|
+
data.tar.gz: 2b2b3ad0b2dd9d089a7b6127ed1b0db21e7f4fa5f0c31e6b366d9b5ae444e2244d4200c813b7a3257f43702d2caa9f264515e701602c24f4482a746b89d41328
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -33,26 +33,39 @@ $ gem install twitterscraper-ruby
|
|
33
33
|
Command-line interface:
|
34
34
|
|
35
35
|
```shell script
|
36
|
-
|
37
|
-
|
36
|
+
# Returns a collection of relevant tweets matching a specified query.
|
37
|
+
$ twitterscraper --type search --query KEYWORD --start_date 2020-06-01 --end_date 2020-06-30 --lang ja \
|
38
|
+
--limit 100 --threads 10 --output tweets.json
|
39
|
+
```
|
40
|
+
|
41
|
+
```shell script
|
42
|
+
# Returns a collection of the most recent tweets posted by the user indicated by the screen_name
|
43
|
+
$ twitterscraper --type user --query SCREEN_NAME --limit 100 --output tweets.json
|
38
44
|
```
|
39
45
|
|
40
46
|
From Within Ruby:
|
41
47
|
|
42
48
|
```ruby
|
43
49
|
require 'twitterscraper'
|
50
|
+
client = Twitterscraper::Client.new(cache: true, proxy: true)
|
51
|
+
```
|
44
52
|
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
53
|
+
```ruby
|
54
|
+
# Returns a collection of relevant tweets matching a specified query.
|
55
|
+
tweets = client.search(KEYWORD, start_date: '2020-06-01', end_date: '2020-06-30', lang: 'ja', limit: 100, threads: 10)
|
56
|
+
```
|
57
|
+
|
58
|
+
```ruby
|
59
|
+
# Returns a collection of the most recent tweets posted by the user indicated by the screen_name
|
60
|
+
tweets = client.user_timeline(SCREEN_NAME, limit: 100)
|
61
|
+
```
|
52
62
|
|
53
|
-
client = Twitterscraper::Client.new(cache: true, proxy: true)
|
54
|
-
tweets = client.query_tweets(KEYWORD, options)
|
55
63
|
|
64
|
+
## Attributes
|
65
|
+
|
66
|
+
### Tweet
|
67
|
+
|
68
|
+
```ruby
|
56
69
|
tweets.each do |tweet|
|
57
70
|
puts tweet.tweet_id
|
58
71
|
puts tweet.text
|
@@ -64,11 +77,6 @@ tweets.each do |tweet|
|
|
64
77
|
end
|
65
78
|
```
|
66
79
|
|
67
|
-
|
68
|
-
## Attributes
|
69
|
-
|
70
|
-
### Tweet
|
71
|
-
|
72
80
|
- screen_name
|
73
81
|
- name
|
74
82
|
- user_id
|
@@ -136,6 +144,7 @@ $ cat tweets.json | jq . | less
|
|
136
144
|
| Option | Description | Default |
|
137
145
|
| ------------- | ------------- | ------------- |
|
138
146
|
| `-h`, `--help` | This option displays a summary of twitterscraper. | |
|
147
|
+
| `--type` | Specify a search type. | search |
|
139
148
|
| `--query` | Specify a keyword used during the search. | |
|
140
149
|
| `--start_date` | Used as "since:yyyy-mm-dd for your query. This means "since the date". | |
|
141
150
|
| `--end_date` | Used as "until:yyyy-mm-dd for your query. This means "before the date". | |
|
data/lib/twitterscraper/cli.rb
CHANGED
@@ -16,6 +16,7 @@ module Twitterscraper
|
|
16
16
|
print_version || return if print_version?
|
17
17
|
|
18
18
|
query_options = {
|
19
|
+
type: options['type'],
|
19
20
|
start_date: options['start_date'],
|
20
21
|
end_date: options['end_date'],
|
21
22
|
lang: options['lang'],
|
@@ -59,6 +60,7 @@ module Twitterscraper
|
|
59
60
|
'help',
|
60
61
|
'v',
|
61
62
|
'version',
|
63
|
+
'type:',
|
62
64
|
'query:',
|
63
65
|
'start_date:',
|
64
66
|
'end_date:',
|
@@ -75,6 +77,7 @@ module Twitterscraper
|
|
75
77
|
'verbose',
|
76
78
|
)
|
77
79
|
|
80
|
+
options['type'] ||= 'search'
|
78
81
|
options['start_date'] = Query::OLDEST_DATE if options['start_date'] == 'oldest'
|
79
82
|
options['lang'] ||= ''
|
80
83
|
options['limit'] = (options['limit'] || 100).to_i
|
data/lib/twitterscraper/query.rb
CHANGED
@@ -22,23 +22,24 @@ module Twitterscraper
|
|
22
22
|
RELOAD_URL = 'https://twitter.com/i/search/timeline?f=tweets&vertical=' +
|
23
23
|
'default&include_available_features=1&include_entities=1&' +
|
24
24
|
'reset_error_state=false&src=typd&max_position=__POS__&q=__QUERY__&l=__LANG__'
|
25
|
-
INIT_URL_USER = 'https://twitter.com/
|
26
|
-
RELOAD_URL_USER = 'https://twitter.com/i/profiles/show/
|
25
|
+
INIT_URL_USER = 'https://twitter.com/__USER__'
|
26
|
+
RELOAD_URL_USER = 'https://twitter.com/i/profiles/show/__USER__/timeline/tweets?' +
|
27
27
|
'include_available_features=1&include_entities=1&' +
|
28
|
-
'max_position=
|
29
|
-
|
30
|
-
def build_query_url(query, lang,
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
# end
|
38
|
-
if pos
|
39
|
-
RELOAD_URL.sub('__QUERY__', query).sub('__LANG__', lang.to_s).sub('__POS__', pos)
|
28
|
+
'max_position=__POS__&reset_error_state=false'
|
29
|
+
|
30
|
+
def build_query_url(query, lang, from_user, pos)
|
31
|
+
if from_user
|
32
|
+
if pos
|
33
|
+
RELOAD_URL_USER.sub('__USER__', query).sub('__POS__', pos.to_s)
|
34
|
+
else
|
35
|
+
INIT_URL_USER.sub('__USER__', query)
|
36
|
+
end
|
40
37
|
else
|
41
|
-
|
38
|
+
if pos
|
39
|
+
RELOAD_URL.sub('__QUERY__', query).sub('__LANG__', lang.to_s).sub('__POS__', pos)
|
40
|
+
else
|
41
|
+
INIT_URL.sub('__QUERY__', query).sub('__LANG__', lang.to_s)
|
42
|
+
end
|
42
43
|
end
|
43
44
|
end
|
44
45
|
|
@@ -74,11 +75,11 @@ module Twitterscraper
|
|
74
75
|
[items_html, json_resp]
|
75
76
|
end
|
76
77
|
|
77
|
-
def query_single_page(query, lang,
|
78
|
+
def query_single_page(query, lang, type, pos, headers: [], proxies: [])
|
78
79
|
logger.info "Querying #{query}"
|
79
80
|
query = ERB::Util.url_encode(query)
|
80
81
|
|
81
|
-
url = build_query_url(query, lang,
|
82
|
+
url = build_query_url(query, lang, type == 'user', pos)
|
82
83
|
http_request = lambda do
|
83
84
|
logger.debug "Scraping tweets from #{url}"
|
84
85
|
get_single_page(url, headers, proxies)
|
@@ -107,8 +108,8 @@ module Twitterscraper
|
|
107
108
|
|
108
109
|
if json_resp
|
109
110
|
[tweets, json_resp['min_position']]
|
110
|
-
elsif
|
111
|
-
|
111
|
+
elsif type
|
112
|
+
[tweets, tweets[-1].tweet_id]
|
112
113
|
else
|
113
114
|
[tweets, "TWEET-#{tweets[-1].tweet_id}-#{tweets[0].tweet_id}"]
|
114
115
|
end
|
@@ -116,7 +117,7 @@ module Twitterscraper
|
|
116
117
|
|
117
118
|
OLDEST_DATE = Date.parse('2006-03-21')
|
118
119
|
|
119
|
-
def validate_options!(queries, start_date:, end_date:, lang:, limit:, threads:)
|
120
|
+
def validate_options!(queries, type:, start_date:, end_date:, lang:, limit:, threads:)
|
120
121
|
query = queries[0]
|
121
122
|
if query.nil? || query == ''
|
122
123
|
raise Error.new('Please specify a search query.')
|
@@ -161,12 +162,12 @@ module Twitterscraper
|
|
161
162
|
end
|
162
163
|
end
|
163
164
|
|
164
|
-
def main_loop(query, lang, limit, daily_limit, headers, proxies)
|
165
|
+
def main_loop(query, lang, type, limit, daily_limit, headers, proxies)
|
165
166
|
pos = nil
|
166
167
|
daily_tweets = []
|
167
168
|
|
168
169
|
while true
|
169
|
-
new_tweets, new_pos = query_single_page(query, lang, pos, headers: headers, proxies: proxies)
|
170
|
+
new_tweets, new_pos = query_single_page(query, lang, type, pos, headers: headers, proxies: proxies)
|
170
171
|
unless new_tweets.empty?
|
171
172
|
daily_tweets.concat(new_tweets)
|
172
173
|
daily_tweets.uniq! { |t| t.tweet_id }
|
@@ -195,7 +196,7 @@ module Twitterscraper
|
|
195
196
|
@stop_requested
|
196
197
|
end
|
197
198
|
|
198
|
-
def query_tweets(query, start_date: nil, end_date: nil, lang:
|
199
|
+
def query_tweets(query, type: 'search', start_date: nil, end_date: nil, lang: nil, limit: 100, daily_limit: nil, order: 'desc', threads: 2)
|
199
200
|
start_date = Date.parse(start_date) if start_date && start_date.is_a?(String)
|
200
201
|
end_date = Date.parse(end_date) if end_date && end_date.is_a?(String)
|
201
202
|
queries = build_queries(query, start_date, end_date)
|
@@ -213,7 +214,7 @@ module Twitterscraper
|
|
213
214
|
logger.debug "Cache #{cache_enabled? ? 'enabled' : 'disabled'}"
|
214
215
|
|
215
216
|
|
216
|
-
validate_options!(queries, start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads)
|
217
|
+
validate_options!(queries, type: type, start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads)
|
217
218
|
|
218
219
|
logger.info "The number of threads #{threads}"
|
219
220
|
|
@@ -229,17 +230,25 @@ module Twitterscraper
|
|
229
230
|
logger.debug "Set 'Thread.abort_on_exception' to true"
|
230
231
|
|
231
232
|
Parallel.each(queries, in_threads: threads) do |query|
|
232
|
-
main_loop(query, lang, limit, daily_limit, headers, proxies)
|
233
|
+
main_loop(query, lang, type, limit, daily_limit, headers, proxies)
|
233
234
|
raise Parallel::Break if stop_requested?
|
234
235
|
end
|
235
236
|
else
|
236
237
|
queries.each do |query|
|
237
|
-
main_loop(query, lang, limit, daily_limit, headers, proxies)
|
238
|
+
main_loop(query, lang, type, limit, daily_limit, headers, proxies)
|
238
239
|
break if stop_requested?
|
239
240
|
end
|
240
241
|
end
|
241
242
|
|
242
243
|
@all_tweets.sort_by { |tweet| (order == 'desc' ? -1 : 1) * tweet.created_at.to_i }
|
243
244
|
end
|
245
|
+
|
246
|
+
def search(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, order: 'desc', threads: 2)
|
247
|
+
query_tweets(query, type: 'search', start_date: start_date, end_date: end_date, lang: lang, limit: limit, daily_limit: daily_limit, order: order, threads: threads)
|
248
|
+
end
|
249
|
+
|
250
|
+
def user_timeline(screen_name, limit: 100, order: 'desc')
|
251
|
+
query_tweets(screen_name, type: 'user', start_date: nil, end_date: nil, lang: nil, limit: limit, daily_limit: nil, order: order, threads: 1)
|
252
|
+
end
|
244
253
|
end
|
245
254
|
end
|
data/lib/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: twitterscraper-ruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.14.0
|
4
|
+
version: 0.15.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- ts-3156
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-07-
|
11
|
+
date: 2020-07-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|