twitterscraper-ruby 0.10.0 → 0.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.circleci/config.yml +31 -0
- data/.gitignore +1 -1
- data/.rspec +2 -0
- data/Gemfile +1 -0
- data/Gemfile.lock +16 -1
- data/README.md +32 -20
- data/bin/twitterscraper +1 -1
- data/lib/twitterscraper.rb +1 -0
- data/lib/twitterscraper/cache.rb +69 -0
- data/lib/twitterscraper/cli.rb +17 -4
- data/lib/twitterscraper/client.rb +13 -0
- data/lib/twitterscraper/proxy.rb +5 -4
- data/lib/twitterscraper/query.rb +93 -47
- data/lib/twitterscraper/tweet.rb +18 -3
- data/lib/version.rb +1 -1
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a950fb24329aaa1020441e258a8a2144100d732142b6c227bb9b026b8bb73996
+  data.tar.gz: 1f64f31e43189e2ee439f5ef6f6d54bc6ea58895adbed67cb8ddbe91af07681a
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 8573affbc9a5faa05e5e489364bb2ba0da1aa4f12af35445e5de8b1f8c399eb0575cc9f408b2ba96c3d7fd8b2a74b7dd703229053a33c1f8a883856818033cb9
+  data.tar.gz: 2b2b3ad0b2dd9d089a7b6127ed1b0db21e7f4fa5f0c31e6b366d9b5ae444e2244d4200c813b7a3257f43702d2caa9f264515e701602c24f4482a746b89d41328
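These digests pin the two archives inside the `.gem`. A sketch of checking one by hand, assuming the gem has been unpacked (e.g. `tar -xf twitterscraper-ruby-0.15.0.gem`) so that `data.tar.gz` sits in the current directory:

```ruby
require 'digest'

# SHA256 of data.tar.gz, compared against the value recorded in
# checksums.yaml above; the SHA512 check is identical with Digest::SHA512.
expected = '1f64f31e43189e2ee439f5ef6f6d54bc6ea58895adbed67cb8ddbe91af07681a'
actual = Digest::SHA256.file('data.tar.gz').hexdigest
puts(actual == expected ? 'data.tar.gz: OK' : 'data.tar.gz: MISMATCH')
```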
data/.circleci/config.yml
ADDED
@@ -0,0 +1,31 @@
+version: 2.1
+orbs:
+  ruby: circleci/ruby@0.1.2
+
+jobs:
+  build:
+    docker:
+      - image: circleci/ruby:2.6.4-stretch-node
+        environment:
+          BUNDLER_VERSION: 2.1.4
+    executor: ruby/default
+    steps:
+      - checkout
+      - run:
+          name: Update bundler
+          command: gem update bundler
+      - run:
+          name: Which bundler?
+          command: bundle -v
+      - restore_cache:
+          keys:
+            - gem-cache-v1-{{ arch }}-{{ .Branch }}-{{ checksum "Gemfile.lock" }}
+            - gem-cache-v1-{{ arch }}-{{ .Branch }}
+            - gem-cache-v1
+      - run: bundle install --path vendor/bundle
+      - run: bundle clean
+      - save_cache:
+          key: gem-cache-v1-{{ arch }}-{{ .Branch }}-{{ checksum "Gemfile.lock" }}
+          paths:
+            - vendor/bundle
+      - run: bundle exec rspec
data/.gitignore
CHANGED
data/.rspec
ADDED
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,19 +1,33 @@
 PATH
   remote: .
   specs:
-    twitterscraper-ruby (0.10.0)
+    twitterscraper-ruby (0.15.0)
       nokogiri
       parallel
 
 GEM
   remote: https://rubygems.org/
   specs:
+    diff-lcs (1.4.4)
     mini_portile2 (2.4.0)
     minitest (5.14.1)
     nokogiri (1.10.10)
       mini_portile2 (~> 2.4.0)
     parallel (1.19.2)
     rake (12.3.3)
+    rspec (3.9.0)
+      rspec-core (~> 3.9.0)
+      rspec-expectations (~> 3.9.0)
+      rspec-mocks (~> 3.9.0)
+    rspec-core (3.9.2)
+      rspec-support (~> 3.9.3)
+    rspec-expectations (3.9.2)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.9.0)
+    rspec-mocks (3.9.1)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.9.0)
+    rspec-support (3.9.3)
 
 PLATFORMS
   ruby
@@ -21,6 +35,7 @@ PLATFORMS
 
 DEPENDENCIES
   minitest (~> 5.0)
   rake (~> 12.0)
+  rspec
   twitterscraper-ruby!
 
 BUNDLED WITH
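The only dependency change is the rspec family pulled in for the new test setup, which the CI job above exercises with its final `bundle exec rspec` step. A hypothetical spec, not taken from the gem's own suite, showing the kind of test that step would run against the new `Twitterscraper::Cache`:

```ruby
# spec/cache_spec.rb (hypothetical, for illustration only)
require 'twitterscraper'

RSpec.describe Twitterscraper::Cache do
  it 'returns a value that was just written' do
    cache = described_class.new
    cache.write('https://example.com/?q=ruby', '<html></html>')
    expect(cache.read('https://example.com/?q=ruby')).to eq('<html></html>')
  end
end
```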
data/README.md
CHANGED
@@ -1,5 +1,6 @@
 # twitterscraper-ruby
 
+[](https://circleci.com/gh/ts-3156/twitterscraper-ruby)
 [](https://badge.fury.io/rb/twitterscraper-ruby)
 
 A gem to scrape https://twitter.com/search. This gem is inspired by [taspinar/twitterscraper](https://github.com/taspinar/twitterscraper).
@@ -32,27 +33,39 @@ $ gem install twitterscraper-ruby
 Command-line interface:
 
 ```shell script
-
-
+# Returns a collection of relevant tweets matching a specified query.
+$ twitterscraper --type search --query KEYWORD --start_date 2020-06-01 --end_date 2020-06-30 --lang ja \
+    --limit 100 --threads 10 --output tweets.json
+```
+
+```shell script
+# Returns a collection of the most recent tweets posted by the user indicated by the screen_name
+$ twitterscraper --type user --query SCREEN_NAME --limit 100 --output tweets.json
 ```
 
 From Within Ruby:
 
 ```ruby
 require 'twitterscraper'
+client = Twitterscraper::Client.new(cache: true, proxy: true)
+```
+
+```ruby
+# Returns a collection of relevant tweets matching a specified query.
+tweets = client.search(KEYWORD, start_date: '2020-06-01', end_date: '2020-06-30', lang: 'ja', limit: 100, threads: 10)
+```
 
-
-
-
-
-  limit: 100,
-  threads: 10,
-  proxy: true
-}
+```ruby
+# Returns a collection of the most recent tweets posted by the user indicated by the screen_name
+tweets = client.user_timeline(SCREEN_NAME, limit: 100)
+```
 
-client = Twitterscraper::Client.new
-tweets = client.query_tweets(KEYWORD, options)
 
+## Attributes
+
+### Tweet
+
+```ruby
 tweets.each do |tweet|
   puts tweet.tweet_id
   puts tweet.text
@@ -64,11 +77,6 @@ tweets.each do |tweet|
 end
 ```
 
-
-## Attributes
-
-### Tweet
-
 - screen_name
 - name
 - user_id
@@ -136,15 +144,19 @@ $ cat tweets.json | jq . | less
 | Option | Description | Default |
 | ------------- | ------------- | ------------- |
 | `-h`, `--help` | This option displays a summary of twitterscraper. | |
+| `--type` | Specify a search type. | search |
 | `--query` | Specify a keyword used during the search. | |
-| `--start_date` |
-| `--end_date` |
+| `--start_date` | Used as "since:yyyy-mm-dd" for your query. This means "since the date". | |
+| `--end_date` | Used as "until:yyyy-mm-dd" for your query. This means "before the date". | |
 | `--lang` | Retrieve tweets written in a specific language. | |
 | `--limit` | Stop scraping when *at least* the number of tweets indicated with --limit is scraped. | 100 |
+| `--order` | Sort order of the results. | desc |
 | `--threads` | Set the number of threads twitterscraper-ruby should initiate while scraping for your query. | 2 |
-| `--proxy` | Scrape https://twitter.com/search via proxies. |
+| `--proxy` | Scrape https://twitter.com/search via proxies. | true |
+| `--cache` | Enable caching. | true |
 | `--format` | The format of the output. | json |
 | `--output` | The name of the output file. | tweets.json |
+| `--verbose` | Print debug messages. | tweets.json |
 
 
 ## Contributing
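Both entry points now route through a single `Client`. Restating the README's own examples side by side, CLI invocation first and Ruby equivalent second (query values are placeholders):

```ruby
require 'twitterscraper'

client = Twitterscraper::Client.new(cache: true, proxy: true)

# twitterscraper --type search --query ruby --limit 10
tweets = client.search('ruby', limit: 10)

# twitterscraper --type user --query ts-3156 --limit 10
timeline = client.user_timeline('ts-3156', limit: 10)
```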
data/bin/twitterscraper
CHANGED
data/lib/twitterscraper.rb
CHANGED
data/lib/twitterscraper/cache.rb
ADDED
@@ -0,0 +1,69 @@
+require 'base64'
+require 'digest/md5'
+
+module Twitterscraper
+  class Cache
+    def initialize()
+      @ttl = 3600 # 1 hour
+      @dir = 'cache'
+      Dir.mkdir(@dir) unless File.exist?(@dir)
+    end
+
+    def read(key)
+      key = cache_key(key)
+      file = File.join(@dir, key)
+      entry = Entry.from_json(File.read(file))
+      entry.value if entry.time > Time.now - @ttl
+    rescue Errno::ENOENT => e
+      nil
+    end
+
+    def write(key, value)
+      key = cache_key(key)
+      entry = Entry.new(key, value, Time.now)
+      file = File.join(@dir, key)
+      File.write(file, entry.to_json)
+    end
+
+    def fetch(key, &block)
+      if (value = read(key))
+        value
+      else
+        yield.tap { |v| write(key, v) }
+      end
+    end
+
+    def cache_key(key)
+      value = key.gsub(':', '%3A').gsub('/', '%2F').gsub('?', '%3F').gsub('=', '%3D').gsub('&', '%26')
+      value = Digest::MD5.hexdigest(value) if value.length >= 100
+      value
+    end
+
+    class Entry < Hash
+      attr_reader :key, :value, :time
+
+      def initialize(key, value, time)
+        @key = key
+        @value = value
+        @time = time
+      end
+
+      def attrs
+        {key: @key, value: @value, time: @time}
+      end
+
+      def to_json
+        hash = attrs
+        hash[:value] = Base64.encode64(hash[:value])
+        hash.to_json
+      end
+
+      class << self
+        def from_json(text)
+          json = JSON.parse(text)
+          new(json['key'], Base64.decode64(json['value']), Time.parse(json['time']))
+        end
+      end
+    end
+  end
+end
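The cache keys entries by request URL (percent-encoding the unsafe characters, or hashing keys of 100+ characters with MD5), Base64-encodes the stored value, and treats anything older than the one-hour TTL as a miss. A small usage sketch, assuming the gem is loaded:

```ruby
require 'twitterscraper'

cache = Twitterscraper::Cache.new # creates ./cache on first use

# A miss runs the block and persists the result; a second call within
# the one-hour TTL returns the stored value without running the block.
body = cache.fetch('https://twitter.com/i/search/timeline?q=ruby') do
  '<li class="js-stream-item">stub</li>' # stand-in for the HTTP response
end
puts body
```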
data/lib/twitterscraper/cli.rb
CHANGED
@@ -16,14 +16,16 @@ module Twitterscraper
       print_version || return if print_version?
 
       query_options = {
+          type: options['type'],
           start_date: options['start_date'],
           end_date: options['end_date'],
           lang: options['lang'],
           limit: options['limit'],
+          daily_limit: options['daily_limit'],
+          order: options['order'],
           threads: options['threads'],
-          proxy: options['proxy']
       }
-      client = Twitterscraper::Client.new
+      client = Twitterscraper::Client.new(cache: options['cache'], proxy: options['proxy'])
       tweets = client.query_tweets(options['query'], query_options)
       export(tweets) unless tweets.empty?
     end
@@ -58,25 +60,36 @@ module Twitterscraper
         'help',
         'v',
         'version',
+        'type:',
         'query:',
         'start_date:',
         'end_date:',
         'lang:',
         'limit:',
+        'daily_limit:',
+        'order:',
         'threads:',
         'output:',
         'format:',
-        '
+        'cache:',
+        'proxy:',
         'pretty',
         'verbose',
       )
 
+      options['type'] ||= 'search'
+      options['start_date'] = Query::OLDEST_DATE if options['start_date'] == 'oldest'
       options['lang'] ||= ''
       options['limit'] = (options['limit'] || 100).to_i
+      options['daily_limit'] = options['daily_limit'].to_i if options['daily_limit']
      options['threads'] = (options['threads'] || 2).to_i
      options['format'] ||= 'json'
+      options['order'] ||= 'desc'
      options['output'] ||= "tweets.#{options['format']}"
 
+      options['cache'] = options['cache'] != 'false'
+      options['proxy'] = options['proxy'] != 'false'
+
       options
     end
@@ -100,7 +113,7 @@ module Twitterscraper
     end
 
     def print_version
-      puts "twitterscraper-#{
+      puts "twitterscraper-#{VERSION}"
     end
   end
 end
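Because option values arrive from the parser as strings, `--cache` and `--proxy` are coerced with `!= 'false'`: an omitted flag (nil) and every value other than the literal string 'false' count as enabled, which is how both default to true. The rule in isolation:

```ruby
# Mirrors options['cache'] = options['cache'] != 'false' above.
def flag_enabled?(raw)
  raw != 'false' # nil (flag omitted) is also "not 'false'", hence true
end

p flag_enabled?(nil)     # => true  (default)
p flag_enabled?('true')  # => true
p flag_enabled?('false') # => false (the only way to disable)
```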
data/lib/twitterscraper/proxy.rb
CHANGED
@@ -17,15 +17,17 @@ module Twitterscraper
         reload
       end
       @cur_index += 1
-
-      Twitterscraper.logger.info("Using proxy #{item}")
-      item
+      @items[@cur_index - 1]
     end
 
     def size
       @items.size
     end
 
+    def empty?
+      @items.empty?
+    end
+
     private
 
     def reload
@@ -51,7 +53,6 @@ module Twitterscraper
         proxies << ip + ':' + port
       end
 
-      Twitterscraper.logger.debug "Fetch #{proxies.size} proxies"
       proxies.shuffle
     rescue => e
       if (retries -= 1) > 0
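The pool method shown now just advances a cursor over the shuffled list and returns the next item, reloading when the list is exhausted; the per-pick logging moved to the caller in query.rb. A stripped-down model of that rotation (the real `reload` re-downloads the proxy list, while this one merely resets the cursor):

```ruby
class MiniPool
  def initialize(items)
    @items = items
    @cur_index = 0
  end

  def sample
    @cur_index = 0 if @cur_index >= @items.size # stands in for reload
    @cur_index += 1
    @items[@cur_index - 1]
  end
end

pool = MiniPool.new(%w[203.0.113.1:80 203.0.113.2:8080])
3.times { puts pool.sample } # cycles through the list in order
```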
data/lib/twitterscraper/query.rb
CHANGED
@@ -22,36 +22,41 @@ module Twitterscraper
     RELOAD_URL = 'https://twitter.com/i/search/timeline?f=tweets&vertical=' +
         'default&include_available_features=1&include_entities=1&' +
         'reset_error_state=false&src=typd&max_position=__POS__&q=__QUERY__&l=__LANG__'
-    INIT_URL_USER = 'https://twitter.com/
-    RELOAD_URL_USER = 'https://twitter.com/i/profiles/show/
+    INIT_URL_USER = 'https://twitter.com/__USER__'
+    RELOAD_URL_USER = 'https://twitter.com/i/profiles/show/__USER__/timeline/tweets?' +
         'include_available_features=1&include_entities=1&' +
-        'max_position=
-
-    def build_query_url(query, lang, 
-
-
-
-
-
-
-      # end
-      if pos
-        RELOAD_URL.sub('__QUERY__', query).sub('__LANG__', lang.to_s).sub('__POS__', pos)
+        'max_position=__POS__&reset_error_state=false'
+
+    def build_query_url(query, lang, from_user, pos)
+      if from_user
+        if pos
+          RELOAD_URL_USER.sub('__USER__', query).sub('__POS__', pos.to_s)
+        else
+          INIT_URL_USER.sub('__USER__', query)
+        end
       else
-
+        if pos
+          RELOAD_URL.sub('__QUERY__', query).sub('__LANG__', lang.to_s).sub('__POS__', pos)
+        else
+          INIT_URL.sub('__QUERY__', query).sub('__LANG__', lang.to_s)
+        end
       end
     end
 
     def get_single_page(url, headers, proxies, timeout = 6, retries = 30)
       return nil if stop_requested?
-
+      unless proxies.empty?
+        proxy = proxies.sample
+        logger.info("Using proxy #{proxy}")
+      end
+      Http.get(url, headers, proxy, timeout)
     rescue => e
       logger.debug "query_single_page: #{e.inspect}"
       if (retries -= 1) > 0
-        logger.info 
+        logger.info "Retrying... (Attempts left: #{retries - 1})"
         retry
       else
-        raise 
+        raise Error.new("#{e.inspect} url=#{url}")
       end
     end
 
@@ -70,15 +75,28 @@ module Twitterscraper
       [items_html, json_resp]
     end
 
-    def query_single_page(query, lang, 
-      logger.info 
+    def query_single_page(query, lang, type, pos, headers: [], proxies: [])
+      logger.info "Querying #{query}"
       query = ERB::Util.url_encode(query)
 
-      url = build_query_url(query, lang, 
-
+      url = build_query_url(query, lang, type == 'user', pos)
+      http_request = lambda do
+        logger.debug "Scraping tweets from #{url}"
+        get_single_page(url, headers, proxies)
+      end
 
-
-
+      if cache_enabled?
+        client = Cache.new
+        if (response = client.read(url))
+          logger.debug 'Fetching tweets from cache'
+        else
+          response = http_request.call
+          client.write(url, response) unless stop_requested?
+        end
+      else
+        response = http_request.call
+      end
+      return [], nil if response.nil? || response.empty?
 
       html, json_resp = parse_single_page(response, pos.nil?)
 
@@ -90,8 +108,8 @@ module Twitterscraper
 
       if json_resp
         [tweets, json_resp['min_position']]
-      elsif 
-
+      elsif type
+        [tweets, tweets[-1].tweet_id]
       else
         [tweets, "TWEET-#{tweets[-1].tweet_id}-#{tweets[0].tweet_id}"]
       end
@@ -99,33 +117,34 @@ module Twitterscraper
 
     OLDEST_DATE = Date.parse('2006-03-21')
 
-    def validate_options!(
+    def validate_options!(queries, type:, start_date:, end_date:, lang:, limit:, threads:)
+      query = queries[0]
       if query.nil? || query == ''
-        raise 'Please specify a search query.'
+        raise Error.new('Please specify a search query.')
       end
 
       if ERB::Util.url_encode(query).length >= 500
-        raise ':query must be a UTF-8, URL-encoded search query of 500 characters maximum, including operators.'
+        raise Error.new(':query must be a UTF-8, URL-encoded search query of 500 characters maximum, including operators.')
       end
 
       if start_date && end_date
         if start_date == end_date
-          raise 'Please specify different values for :start_date and :end_date.'
+          raise Error.new('Please specify different values for :start_date and :end_date.')
         elsif start_date > end_date
-          raise ':start_date must occur before :end_date.'
+          raise Error.new(':start_date must occur before :end_date.')
         end
       end
 
       if start_date
         if start_date < OLDEST_DATE
-          raise ":start_date must be greater than or equal to #{OLDEST_DATE}"
+          raise Error.new(":start_date must be greater than or equal to #{OLDEST_DATE}")
         end
       end
 
       if end_date
         today = Date.today
         if end_date > Date.today
-          raise ":end_date must be less than or equal to today(#{today})"
+          raise Error.new(":end_date must be less than or equal to today(#{today})")
        end
      end
    end
@@ -143,27 +162,32 @@ module Twitterscraper
       end
     end
 
-    def main_loop(query, lang, limit, headers, proxies)
+    def main_loop(query, lang, type, limit, daily_limit, headers, proxies)
       pos = nil
+      daily_tweets = []
 
       while true
-        new_tweets, new_pos = query_single_page(query, lang, pos, headers: headers, proxies: proxies)
+        new_tweets, new_pos = query_single_page(query, lang, type, pos, headers: headers, proxies: proxies)
         unless new_tweets.empty?
+          daily_tweets.concat(new_tweets)
+          daily_tweets.uniq! { |t| t.tweet_id }
+
           @mutex.synchronize {
             @all_tweets.concat(new_tweets)
             @all_tweets.uniq! { |t| t.tweet_id }
           }
         end
-        logger.info 
+        logger.info "Got #{new_tweets.size} tweets (total #{@all_tweets.size})"
 
         break unless new_pos
+        break if daily_limit && daily_tweets.size >= daily_limit
        break if @all_tweets.size >= limit
 
        pos = new_pos
      end
 
-      if @all_tweets.size >= limit
-        logger.
+      if !@stop_requested && @all_tweets.size >= limit
+        logger.warn "The limit you specified has been reached limit=#{limit} tweets=#{@all_tweets.size}"
        @stop_requested = true
      end
    end
@@ -172,37 +196,59 @@ module Twitterscraper
       @stop_requested
     end
 
-    def query_tweets(query, start_date: nil, end_date: nil, lang: 
+    def query_tweets(query, type: 'search', start_date: nil, end_date: nil, lang: nil, limit: 100, daily_limit: nil, order: 'desc', threads: 2)
       start_date = Date.parse(start_date) if start_date && start_date.is_a?(String)
       end_date = Date.parse(end_date) if end_date && end_date.is_a?(String)
       queries = build_queries(query, start_date, end_date)
-
-
+      if threads > queries.size
+        logger.warn 'The maximum number of :threads is the number of dates between :start_date and :end_date.'
+        threads = queries.size
+      end
+      if proxy_enabled?
+        proxies = Proxy::Pool.new
+        logger.debug "Fetch #{proxies.size} proxies"
+      else
+        proxies = []
+        logger.debug 'Proxy disabled'
+      end
+      logger.debug "Cache #{cache_enabled? ? 'enabled' : 'disabled'}"
 
-      validate_options!(queries[0], start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads, proxy: proxy)
 
-
+      validate_options!(queries, type: type, start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads)
+
+      logger.info "The number of threads #{threads}"
 
       headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
-      logger.info 
+      logger.info "Headers #{headers}"
 
       @all_tweets = []
       @mutex = Mutex.new
       @stop_requested = false
 
       if threads > 1
+        Thread.abort_on_exception = true
+        logger.debug "Set 'Thread.abort_on_exception' to true"
+
         Parallel.each(queries, in_threads: threads) do |query|
-          main_loop(query, lang, limit, headers, proxies)
+          main_loop(query, lang, type, limit, daily_limit, headers, proxies)
           raise Parallel::Break if stop_requested?
         end
       else
         queries.each do |query|
-          main_loop(query, lang, limit, headers, proxies)
+          main_loop(query, lang, type, limit, daily_limit, headers, proxies)
           break if stop_requested?
         end
       end
 
-      @all_tweets.sort_by { |tweet| -tweet.created_at.to_i }
+      @all_tweets.sort_by { |tweet| (order == 'desc' ? -1 : 1) * tweet.created_at.to_i }
+    end
+
+    def search(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, order: 'desc', threads: 2)
+      query_tweets(query, type: 'search', start_date: start_date, end_date: end_date, lang: lang, limit: limit, daily_limit: daily_limit, order: order, threads: threads)
+    end
+
+    def user_timeline(screen_name, limit: 100, order: 'desc')
+      query_tweets(screen_name, type: 'user', start_date: nil, end_date: nil, lang: nil, limit: limit, daily_limit: nil, order: order, threads: 1)
     end
   end
 end
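The new `order:` option flips the sort without reversing the array: the `created_at` epoch is multiplied by -1 for descending order and left as-is for ascending. Illustrated with a `Struct` stand-in for `Twitterscraper::Tweet`:

```ruby
require 'time'

Tweet = Struct.new(:tweet_id, :created_at) # minimal stand-in

tweets = [
  Tweet.new(1, Time.parse('2020-06-01')),
  Tweet.new(2, Time.parse('2020-06-03')),
  Tweet.new(3, Time.parse('2020-06-02')),
]

order = 'desc'
sorted = tweets.sort_by { |t| (order == 'desc' ? -1 : 1) * t.created_at.to_i }
p sorted.map(&:tweet_id) # => [2, 3, 1]; with order = 'asc', [1, 3, 2]
```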
data/lib/twitterscraper/tweet.rb
CHANGED
@@ -43,6 +43,14 @@ module Twitterscraper
     end
 
     class << self
+      def from_json(text)
+        json = JSON.parse(text)
+        json.map do |tweet|
+          tweet['created_at'] = Time.parse(tweet['created_at'])
+          new(tweet)
+        end
+      end
+
       def from_html(text)
         html = Nokogiri::HTML(text)
         from_tweets_html(html.xpath("//li[@class[contains(., 'js-stream-item')]]/div[@class[contains(., 'js-stream-tweet')]]"))
@@ -51,12 +59,19 @@ module Twitterscraper
       def from_tweets_html(html)
         html.map do |tweet|
           from_tweet_html(tweet)
-        end
+        end.compact
       end
 
       def from_tweet_html(html)
+        screen_name = html.attr('data-screen-name')
+        tweet_id = html.attr('data-tweet-id')&.to_i
+
+        unless html.to_s.include?('js-tweet-text-container')
+          Twitterscraper.logger.warn "html doesn't include div.js-tweet-text-container url=https://twitter.com/#{screen_name}/status/#{tweet_id}"
+          return nil
+        end
+
         inner_html = Nokogiri::HTML(html.inner_html)
-        tweet_id = html.attr('data-tweet-id').to_i
         text = inner_html.xpath("//div[@class[contains(., 'js-tweet-text-container')]]/p[@class[contains(., 'js-tweet-text')]]").first.text
         links = inner_html.xpath("//a[@class[contains(., 'twitter-timeline-link')]]").map { |elem| elem.attr('data-expanded-url') }.select { |link| link && !link.include?('pic.twitter') }
         image_urls = inner_html.xpath("//div[@class[contains(., 'AdaptiveMedia-photoContainer')]]").map { |elem| elem.attr('data-image-url') }
@@ -81,7 +96,7 @@ module Twitterscraper
 
         timestamp = inner_html.xpath("//span[@class[contains(., 'js-short-timestamp')]]").first.attr('data-time').to_i
         new(
-          screen_name: 
+          screen_name: screen_name,
           name: html.attr('data-name'),
           user_id: html.attr('data-user-id').to_i,
           tweet_id: tweet_id,
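`from_json` expects the JSON array the CLI exports and revives each `created_at` string into a `Time` before building tweets. The revival step in isolation, on a hand-written record (the field set is trimmed for illustration):

```ruby
require 'json'
require 'time'

text = '[{"tweet_id":1,"text":"hello","created_at":"2020-07-17 12:00:00 +0000"}]'

tweets = JSON.parse(text).map do |tweet|
  tweet['created_at'] = Time.parse(tweet['created_at'])
  tweet
end

p tweets.first['created_at'] # => 2020-07-17 12:00:00 +0000 (a Time)
```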
data/lib/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: twitterscraper-ruby
 version: !ruby/object:Gem::Version
-  version: 0.10.0
+  version: 0.15.0
 platform: ruby
 authors:
 - ts-3156
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2020-07-
+date: 2020-07-17 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -46,8 +46,10 @@ executables:
 extensions: []
 extra_rdoc_files: []
 files:
+- ".circleci/config.yml"
 - ".gitignore"
 - ".irbrc"
+- ".rspec"
 - ".ruby-version"
 - ".travis.yml"
 - CODE_OF_CONDUCT.md
@@ -61,6 +63,7 @@ files:
 - bin/twitterscraper
 - lib/twitterscraper-ruby.rb
 - lib/twitterscraper.rb
+- lib/twitterscraper/cache.rb
 - lib/twitterscraper/cli.rb
 - lib/twitterscraper/client.rb
 - lib/twitterscraper/http.rb