twitterscraper-ruby 0.2.0 → 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +3 -1
- data/README.md +97 -16
- data/bin/twitterscraper +13 -0
- data/lib/twitterscraper.rb +1 -2
- data/lib/twitterscraper/cli.rb +92 -0
- data/lib/twitterscraper/http.rb +2 -1
- data/lib/twitterscraper/proxy.rb +34 -3
- data/lib/twitterscraper/query.rb +130 -69
- data/lib/twitterscraper/tweet.rb +8 -2
- data/lib/version.rb +1 -1
- data/twitterscraper-ruby.gemspec +4 -2
- metadata +22 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9cfd03782734642da8ac29839788f142399d2a3f4ec601e8b6f47ae1ca38c17f
|
4
|
+
data.tar.gz: 07a398e51fd2fbdc735ae27008d9a23e97dc390632179738045db4c81bd4fcad
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6f417fe3379a3d9d134c308a9ea9d4e01b458018c9c5a3f8508a85e7f5890d01991838cfcabe87b8246f69edf4458c66d17924359798017907862071353f643d
|
7
|
+
data.tar.gz: 758bcb55ded936c3696f99647f64bc9921386b3cb0c783c218510c0e36991ae6b95a9d08fa071e02072c8b727bbadb6674ceeb19a74e356a842d62c1ec4c038f
|
data/Gemfile.lock
CHANGED
@@ -1,8 +1,9 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
twitterscraper-ruby (0.2.0)
|
4
|
+
twitterscraper-ruby (0.7.0)
|
5
5
|
nokogiri
|
6
|
+
parallel
|
6
7
|
|
7
8
|
GEM
|
8
9
|
remote: https://rubygems.org/
|
@@ -11,6 +12,7 @@ GEM
|
|
11
12
|
minitest (5.14.1)
|
12
13
|
nokogiri (1.10.10)
|
13
14
|
mini_portile2 (~> 2.4.0)
|
15
|
+
parallel (1.19.2)
|
14
16
|
rake (12.3.3)
|
15
17
|
|
16
18
|
PLATFORMS
|
data/README.md
CHANGED
@@ -1,46 +1,127 @@
|
|
1
1
|
# twitterscraper-ruby
|
2
2
|
|
3
|
-
|
3
|
+
[![Gem Version](https://badge.fury.io/rb/twitterscraper-ruby.svg)](https://badge.fury.io/rb/twitterscraper-ruby)
|
4
4
|
|
5
|
-
|
5
|
+
A gem to scrape https://twitter.com/search. This gem is inspired by [taspinar/twitterscraper](https://github.com/taspinar/twitterscraper).
|
6
6
|
|
7
|
-
## Installation
|
8
7
|
|
9
|
-
|
8
|
+
## Twitter Search API vs. twitterscraper-ruby
|
10
9
|
|
11
|
-
|
12
|
-
|
13
|
-
|
10
|
+
### Twitter Search API
|
11
|
+
|
12
|
+
- The number of tweets: 180 - 450 requests/15 minutes (18,000 - 45,000 tweets/15 minutes)
|
13
|
+
- The time window: the past 7 days
|
14
|
+
|
15
|
+
### twitterscraper-ruby
|
16
|
+
|
17
|
+
- The number of tweets: Unlimited
|
18
|
+
- The time window: from 2006-3-21 to today
|
14
19
|
|
15
|
-
And then execute:
|
16
20
|
|
17
|
-
|
21
|
+
## Installation
|
18
22
|
|
19
|
-
|
23
|
+
First install the library:
|
20
24
|
|
21
|
-
|
25
|
+
```shell script
|
26
|
+
$ gem install twitterscraper-ruby
|
27
|
+
```
|
28
|
+
|
22
29
|
|
23
30
|
## Usage
|
24
31
|
|
32
|
+
Command-line interface:
|
33
|
+
|
34
|
+
```shell script
|
35
|
+
$ twitterscraper --query KEYWORD --start_date 2020-06-01 --end_date 2020-06-30 --lang ja \
|
36
|
+
--limit 100 --threads 10 --proxy --output output.json
|
37
|
+
```
|
38
|
+
|
39
|
+
From Within Ruby:
|
40
|
+
|
25
41
|
```ruby
|
26
42
|
require 'twitterscraper'
|
43
|
+
|
44
|
+
options = {
|
45
|
+
start_date: '2020-06-01',
|
46
|
+
end_date: '2020-06-30',
|
47
|
+
lang: 'ja',
|
48
|
+
limit: 100,
|
49
|
+
threads: 10,
|
50
|
+
proxy: true
|
51
|
+
}
|
52
|
+
|
53
|
+
client = Twitterscraper::Client.new
|
54
|
+
tweets = client.query_tweets(KEYWORD, options)
|
55
|
+
|
56
|
+
tweets.each do |tweet|
|
57
|
+
puts tweet.tweet_id
|
58
|
+
puts tweet.text
|
59
|
+
puts tweet.created_at
|
60
|
+
puts tweet.tweet_url
|
61
|
+
end
|
27
62
|
```
|
28
63
|
|
29
|
-
## Development
|
30
64
|
|
31
|
-
|
65
|
+
## Examples
|
66
|
+
|
67
|
+
```shell script
|
68
|
+
$ twitterscraper --query twitter --limit 1000
|
69
|
+
$ cat tweets.json | jq . | less
|
70
|
+
```
|
71
|
+
|
72
|
+
```json
|
73
|
+
[
|
74
|
+
{
|
75
|
+
"screen_name": "@screenname",
|
76
|
+
"name": "name",
|
77
|
+
"user_id": 1194529546483000000,
|
78
|
+
"tweet_id": 1282659891992000000,
|
79
|
+
"tweet_url": "https://twitter.com/screenname/status/1282659891992000000",
|
80
|
+
"created_at": "2020-07-13 12:00:00 +0000",
|
81
|
+
"text": "Thanks Twitter!"
|
82
|
+
},
|
83
|
+
...
|
84
|
+
]
|
85
|
+
```
|
86
|
+
|
87
|
+
## Attributes
|
88
|
+
|
89
|
+
### Tweet
|
90
|
+
|
91
|
+
- tweet_id
|
92
|
+
- text
|
93
|
+
- user_id
|
94
|
+
- screen_name
|
95
|
+
- name
|
96
|
+
- tweet_url
|
97
|
+
- created_at
|
98
|
+
|
99
|
+
|
100
|
+
## CLI Options
|
101
|
+
|
102
|
+
| Option | Description | Default |
|
103
|
+
| ------------- | ------------- | ------------- |
|
104
|
+
| `-h`, `--help` | This option displays a summary of twitterscraper. | |
|
105
|
+
| `--query` | Specify a keyword used during the search. | |
|
106
|
+
| `--start_date` | Set the date from which twitterscraper-ruby should start scraping for your query. | |
|
107
|
+
| `--end_date` | Set the end date at which twitterscraper-ruby should stop scraping for your query. | |
|
108
|
+
| `--lang` | Retrieve tweets written in a specific language. | |
|
109
|
+
| `--limit` | Stop scraping once *at least* the number of tweets specified with `--limit` has been scraped. | 100 |
|
110
|
+
| `--threads` | Set the number of threads twitterscraper-ruby should initiate while scraping for your query. | 2 |
|
111
|
+
| `--proxy` | Scrape https://twitter.com/search via proxies. | false |
|
112
|
+
| `--output` | The name of the output file. | tweets.json |
|
32
113
|
|
33
|
-
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
34
114
|
|
35
115
|
## Contributing
|
36
116
|
|
37
|
-
Bug reports and pull requests are welcome on GitHub at https://github.com/
|
117
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/ts-3156/twitterscraper-ruby. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](https://github.com/ts-3156/twitterscraper-ruby/blob/master/CODE_OF_CONDUCT.md).
|
38
118
|
|
39
119
|
|
40
120
|
## License
|
41
121
|
|
42
122
|
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
43
123
|
|
124
|
+
|
44
125
|
## Code of Conduct
|
45
126
|
|
46
|
-
Everyone interacting in the
|
127
|
+
Everyone interacting in the twitterscraper-ruby project's codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/ts-3156/twitterscraper-ruby/blob/master/CODE_OF_CONDUCT.md).
|
data/bin/twitterscraper
ADDED
data/lib/twitterscraper.rb
CHANGED
@@ -9,10 +9,9 @@ require 'version'
|
|
9
9
|
|
10
10
|
module Twitterscraper
|
11
11
|
class Error < StandardError; end
|
12
|
-
# Your code goes here...
|
13
12
|
|
14
13
|
def self.logger
|
15
|
-
@logger ||= ::Logger.new(STDOUT)
|
14
|
+
@logger ||= ::Logger.new(STDOUT, level: ::Logger::INFO)
|
16
15
|
end
|
17
16
|
|
18
17
|
def self.logger=(logger)
|
@@ -0,0 +1,92 @@
|
|
1
|
+
$stdout.sync = true
|
2
|
+
|
3
|
+
require 'json'
|
4
|
+
require 'optparse'
|
5
|
+
require 'twitterscraper'
|
6
|
+
|
7
|
+
module Twitterscraper
|
8
|
+
class Cli
|
9
|
+
def parse
|
10
|
+
@options = parse_options(ARGV)
|
11
|
+
initialize_logger
|
12
|
+
end
|
13
|
+
|
14
|
+
def run
|
15
|
+
print_help || return if print_help?
|
16
|
+
print_version || return if print_version?
|
17
|
+
|
18
|
+
query_options = {
|
19
|
+
start_date: options['start_date'],
|
20
|
+
end_date: options['end_date'],
|
21
|
+
lang: options['lang'],
|
22
|
+
limit: options['limit'],
|
23
|
+
threads: options['threads'],
|
24
|
+
proxy: options['proxy']
|
25
|
+
}
|
26
|
+
client = Twitterscraper::Client.new
|
27
|
+
tweets = client.query_tweets(options['query'], query_options)
|
28
|
+
File.write(options['output'], generate_json(tweets))
|
29
|
+
end
|
30
|
+
|
31
|
+
def generate_json(tweets)
|
32
|
+
if options['pretty']
|
33
|
+
::JSON.pretty_generate(tweets)
|
34
|
+
else
|
35
|
+
::JSON.generate(tweets)
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
def options
|
40
|
+
@options
|
41
|
+
end
|
42
|
+
|
43
|
+
def parse_options(argv)
|
44
|
+
options = argv.getopts(
|
45
|
+
'h',
|
46
|
+
'help',
|
47
|
+
'v',
|
48
|
+
'version',
|
49
|
+
'query:',
|
50
|
+
'start_date:',
|
51
|
+
'end_date:',
|
52
|
+
'lang:',
|
53
|
+
'limit:',
|
54
|
+
'threads:',
|
55
|
+
'output:',
|
56
|
+
'proxy',
|
57
|
+
'pretty',
|
58
|
+
'verbose',
|
59
|
+
)
|
60
|
+
|
61
|
+
options['lang'] ||= ''
|
62
|
+
options['limit'] = (options['limit'] || 100).to_i
|
63
|
+
options['threads'] = (options['threads'] || 2).to_i
|
64
|
+
options['output'] ||= 'tweets.json'
|
65
|
+
|
66
|
+
options
|
67
|
+
end
|
68
|
+
|
69
|
+
def initialize_logger
|
70
|
+
Twitterscraper.logger.level = ::Logger::DEBUG if options['verbose']
|
71
|
+
end
|
72
|
+
|
73
|
+
def print_help?
|
74
|
+
options['h'] || options['help']
|
75
|
+
end
|
76
|
+
|
77
|
+
def print_help
|
78
|
+
puts <<~'SHELL'
|
79
|
+
Usage:
|
80
|
+
twitterscraper --query KEYWORD --limit 100 --threads 10 --start_date 2020-07-01 --end_date 2020-07-10 --lang ja --proxy --output output.json
|
81
|
+
SHELL
|
82
|
+
end
|
83
|
+
|
84
|
+
def print_version?
|
85
|
+
options['v'] || options['version']
|
86
|
+
end
|
87
|
+
|
88
|
+
def print_version
|
89
|
+
puts "twitterscraper-#{Twitterscraper::VERSION}"
|
90
|
+
end
|
91
|
+
end
|
92
|
+
end
|
data/lib/twitterscraper/http.rb
CHANGED
data/lib/twitterscraper/proxy.rb
CHANGED
@@ -6,22 +6,53 @@ module Twitterscraper
|
|
6
6
|
class RetryExhausted < StandardError
|
7
7
|
end
|
8
8
|
|
9
|
+
class Pool
|
10
|
+
def initialize
|
11
|
+
@items = Proxy.get_proxies
|
12
|
+
@cur_index = 0
|
13
|
+
end
|
14
|
+
|
15
|
+
def sample
|
16
|
+
if @cur_index >= @items.size
|
17
|
+
reload
|
18
|
+
end
|
19
|
+
@cur_index += 1
|
20
|
+
item = @items[@cur_index - 1]
|
21
|
+
Twitterscraper.logger.info("Using proxy #{item}")
|
22
|
+
item
|
23
|
+
end
|
24
|
+
|
25
|
+
def size
|
26
|
+
@items.size
|
27
|
+
end
|
28
|
+
|
29
|
+
private
|
30
|
+
|
31
|
+
def reload
|
32
|
+
@items = Proxy.get_proxies
|
33
|
+
@cur_index = 0
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
9
37
|
module_function
|
10
38
|
|
11
39
|
def get_proxies(retries = 3)
|
12
40
|
response = Twitterscraper::Http.get(PROXY_URL)
|
13
41
|
html = Nokogiri::HTML(response)
|
14
|
-
table = html.xpath('
|
42
|
+
table = html.xpath('//table[@id="proxylisttable"]').first
|
15
43
|
|
16
44
|
proxies = []
|
17
45
|
|
18
46
|
table.xpath('tbody/tr').each do |tr|
|
19
47
|
cells = tr.xpath('td')
|
20
|
-
ip, port =
|
48
|
+
ip, port, anonymity, https = [0, 1, 4, 6].map { |i| cells[i].text.strip }
|
49
|
+
next unless ['elite proxy', 'anonymous'].include?(anonymity)
|
50
|
+
next if https == 'no'
|
21
51
|
proxies << ip + ':' + port
|
22
52
|
end
|
23
53
|
|
24
|
-
proxies
|
54
|
+
Twitterscraper.logger.debug "Fetch #{proxies.size} proxies"
|
55
|
+
proxies.shuffle
|
25
56
|
rescue => e
|
26
57
|
if (retries -= 1) > 0
|
27
58
|
retry
|
data/lib/twitterscraper/query.rb
CHANGED
@@ -1,7 +1,10 @@
|
|
1
|
+
require 'resolv-replace'
|
1
2
|
require 'net/http'
|
2
3
|
require 'nokogiri'
|
3
4
|
require 'date'
|
4
5
|
require 'json'
|
6
|
+
require 'erb'
|
7
|
+
require 'parallel'
|
5
8
|
|
6
9
|
module Twitterscraper
|
7
10
|
module Query
|
@@ -14,7 +17,6 @@ module Twitterscraper
|
|
14
17
|
'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
|
15
18
|
'Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre',
|
16
19
|
]
|
17
|
-
USER_AGENT = USER_AGENT_LIST.sample
|
18
20
|
|
19
21
|
INIT_URL = 'https://twitter.com/search?f=tweets&vertical=default&q=__QUERY__&l=__LANG__'
|
20
22
|
RELOAD_URL = 'https://twitter.com/i/search/timeline?f=tweets&vertical=' +
|
@@ -25,7 +27,7 @@ module Twitterscraper
|
|
25
27
|
'include_available_features=1&include_entities=1&' +
|
26
28
|
'max_position={pos}&reset_error_state=false'
|
27
29
|
|
28
|
-
def
|
30
|
+
def build_query_url(query, lang, pos, from_user = false)
|
29
31
|
# if from_user
|
30
32
|
# if !pos
|
31
33
|
# INIT_URL_USER.format(u = query)
|
@@ -40,52 +42,50 @@ module Twitterscraper
|
|
40
42
|
end
|
41
43
|
end
|
42
44
|
|
43
|
-
def
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
logger.info("Using proxy #{proxy}")
|
54
|
-
|
55
|
-
response = Twitterscraper::Http.get(url, headers, proxy, timeout)
|
56
|
-
rescue => e
|
57
|
-
logger.debug "query_single_page: #{e.inspect}"
|
58
|
-
if (retries -= 1) > 0
|
59
|
-
logger.info("Retrying... (Attempts left: #{retries - 1})")
|
60
|
-
retry
|
61
|
-
else
|
62
|
-
raise
|
63
|
-
end
|
45
|
+
def get_single_page(url, headers, proxies, timeout = 6, retries = 30)
|
46
|
+
return nil if stop_requested?
|
47
|
+
Twitterscraper::Http.get(url, headers, proxies.sample, timeout)
|
48
|
+
rescue => e
|
49
|
+
logger.debug "query_single_page: #{e.inspect}"
|
50
|
+
if (retries -= 1) > 0
|
51
|
+
logger.info("Retrying... (Attempts left: #{retries - 1})")
|
52
|
+
retry
|
53
|
+
else
|
54
|
+
raise
|
64
55
|
end
|
56
|
+
end
|
65
57
|
|
66
|
-
|
67
|
-
|
58
|
+
def parse_single_page(text, html = true)
|
59
|
+
return [nil, nil] if text.nil? || text == ''
|
68
60
|
|
69
|
-
if
|
70
|
-
|
71
|
-
|
72
|
-
html = json_resp['items_html'] || ''
|
73
|
-
rescue => e
|
74
|
-
logger.warn("Failed to parse JSON #{e.inspect} while requesting #{url}")
|
75
|
-
end
|
61
|
+
if html
|
62
|
+
json_resp = nil
|
63
|
+
items_html = text
|
76
64
|
else
|
77
|
-
|
65
|
+
json_resp = JSON.parse(text)
|
66
|
+
items_html = json_resp['items_html'] || ''
|
67
|
+
logger.warn json_resp['message'] if json_resp['message'] # Sorry, you are rate limited.
|
78
68
|
end
|
79
69
|
|
70
|
+
[items_html, json_resp]
|
71
|
+
end
|
72
|
+
|
73
|
+
def query_single_page(query, lang, pos, from_user = false, headers: [], proxies: [])
|
74
|
+
logger.info("Querying #{query}")
|
75
|
+
query = ERB::Util.url_encode(query)
|
76
|
+
|
77
|
+
url = build_query_url(query, lang, pos, from_user)
|
78
|
+
logger.debug("Scraping tweets from #{url}")
|
79
|
+
|
80
|
+
response = get_single_page(url, headers, proxies)
|
81
|
+
return [], nil if response.nil?
|
82
|
+
|
83
|
+
html, json_resp = parse_single_page(response, pos.nil?)
|
84
|
+
|
80
85
|
tweets = Tweet.from_html(html)
|
81
86
|
|
82
87
|
if tweets.empty?
|
83
|
-
|
84
|
-
pos = json_resp['min_position']
|
85
|
-
else
|
86
|
-
pos = nil
|
87
|
-
end
|
88
|
-
return [], pos
|
88
|
+
return [], (json_resp && json_resp['has_more_items'] && json_resp['min_position'])
|
89
89
|
end
|
90
90
|
|
91
91
|
if json_resp
|
@@ -97,51 +97,112 @@ module Twitterscraper
|
|
97
97
|
end
|
98
98
|
end
|
99
99
|
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
if
|
104
|
-
raise 'Please specify
|
105
|
-
|
106
|
-
|
100
|
+
OLDEST_DATE = Date.parse('2006-3-21')
|
101
|
+
|
102
|
+
def validate_options!(query, start_date:, end_date:, lang:, limit:, threads:, proxy:)
|
103
|
+
if query.nil? || query == ''
|
104
|
+
raise 'Please specify a search query.'
|
105
|
+
end
|
106
|
+
|
107
|
+
if ERB::Util.url_encode(query).length >= 500
|
108
|
+
raise ':query must be a UTF-8, URL-encoded search query of 500 characters maximum, including operators.'
|
107
109
|
end
|
108
110
|
|
109
|
-
|
111
|
+
if start_date && end_date
|
112
|
+
if start_date == end_date
|
113
|
+
raise 'Please specify different values for :start_date and :end_date.'
|
114
|
+
elsif start_date > end_date
|
115
|
+
raise ':start_date must occur before :end_date.'
|
116
|
+
end
|
117
|
+
end
|
110
118
|
|
119
|
+
if start_date
|
120
|
+
if start_date < OLDEST_DATE
|
121
|
+
raise ":start_date must be greater than or equal to #{OLDEST_DATE}"
|
122
|
+
end
|
123
|
+
end
|
124
|
+
|
125
|
+
if end_date
|
126
|
+
today = Date.today
|
127
|
+
if end_date > Date.today
|
128
|
+
raise ":end_date must be less than or equal to today(#{today})"
|
129
|
+
end
|
130
|
+
end
|
131
|
+
end
|
132
|
+
|
133
|
+
def build_queries(query, start_date, end_date)
|
134
|
+
if start_date && end_date
|
135
|
+
date_range = start_date.upto(end_date - 1)
|
136
|
+
date_range.map { |date| query + " since:#{date} until:#{date + 1}" }
|
137
|
+
elsif start_date
|
138
|
+
[query + " since:#{start_date}"]
|
139
|
+
elsif end_date
|
140
|
+
[query + " until:#{end_date}"]
|
141
|
+
else
|
142
|
+
[query]
|
143
|
+
end
|
144
|
+
end
|
145
|
+
|
146
|
+
def main_loop(query, lang, limit, headers, proxies)
|
111
147
|
pos = nil
|
112
|
-
all_tweets = []
|
113
148
|
|
114
|
-
|
115
|
-
|
149
|
+
while true
|
150
|
+
new_tweets, new_pos = query_single_page(query, lang, pos, headers: headers, proxies: proxies)
|
151
|
+
unless new_tweets.empty?
|
152
|
+
@mutex.synchronize {
|
153
|
+
@all_tweets.concat(new_tweets)
|
154
|
+
@all_tweets.uniq! { |t| t.tweet_id }
|
155
|
+
}
|
156
|
+
end
|
157
|
+
logger.info("Got #{new_tweets.size} tweets (total #{@all_tweets.size})")
|
116
158
|
|
117
|
-
|
118
|
-
|
159
|
+
break unless new_pos
|
160
|
+
break if @all_tweets.size >= limit
|
119
161
|
|
120
|
-
|
121
|
-
|
162
|
+
pos = new_pos
|
163
|
+
end
|
122
164
|
|
123
|
-
|
165
|
+
if @all_tweets.size >= limit
|
166
|
+
logger.info("Limit reached #{@all_tweets.size}")
|
167
|
+
@stop_requested = true
|
168
|
+
end
|
169
|
+
end
|
124
170
|
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
logger.debug("new_pos=#{new_pos}")
|
171
|
+
def stop_requested?
|
172
|
+
@stop_requested
|
173
|
+
end
|
129
174
|
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
175
|
+
def query_tweets(query, start_date: nil, end_date: nil, lang: '', limit: 100, threads: 2, proxy: false)
|
176
|
+
start_date = Date.parse(start_date) if start_date && start_date.is_a?(String)
|
177
|
+
end_date = Date.parse(end_date) if end_date && end_date.is_a?(String)
|
178
|
+
queries = build_queries(query, start_date, end_date)
|
179
|
+
threads = queries.size if threads > queries.size
|
180
|
+
proxies = proxy ? Twitterscraper::Proxy::Pool.new : []
|
134
181
|
|
135
|
-
|
136
|
-
break if all_tweets.size >= limit
|
182
|
+
validate_options!(queries[0], start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads, proxy: proxy)
|
137
183
|
|
138
|
-
|
139
|
-
end
|
184
|
+
logger.info("The number of threads #{threads}")
|
140
185
|
|
141
|
-
|
186
|
+
headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
|
187
|
+
logger.info("Headers #{headers}")
|
188
|
+
|
189
|
+
@all_tweets = []
|
190
|
+
@mutex = Mutex.new
|
191
|
+
@stop_requested = false
|
192
|
+
|
193
|
+
if threads > 1
|
194
|
+
Parallel.each(queries, in_threads: threads) do |query|
|
195
|
+
main_loop(query, lang, limit, headers, proxies)
|
196
|
+
raise Parallel::Break if stop_requested?
|
197
|
+
end
|
198
|
+
else
|
199
|
+
queries.each do |query|
|
200
|
+
main_loop(query, lang, limit, headers, proxies)
|
201
|
+
break if stop_requested?
|
202
|
+
end
|
142
203
|
end
|
143
204
|
|
144
|
-
all_tweets
|
205
|
+
@all_tweets.sort_by { |tweet| -tweet.created_at.to_i }
|
145
206
|
end
|
146
207
|
end
|
147
208
|
end
|
data/lib/twitterscraper/tweet.rb
CHANGED
@@ -2,7 +2,8 @@ require 'time'
|
|
2
2
|
|
3
3
|
module Twitterscraper
|
4
4
|
class Tweet
|
5
|
-
|
5
|
+
KEYS = [:screen_name, :name, :user_id, :tweet_id, :tweet_url, :created_at, :text]
|
6
|
+
attr_reader *KEYS
|
6
7
|
|
7
8
|
def initialize(attrs)
|
8
9
|
attrs.each do |key, value|
|
@@ -10,6 +11,12 @@ module Twitterscraper
|
|
10
11
|
end
|
11
12
|
end
|
12
13
|
|
14
|
+
def to_json(options = {})
|
15
|
+
KEYS.map do |key|
|
16
|
+
[key, send(key)]
|
17
|
+
end.to_h.to_json
|
18
|
+
end
|
19
|
+
|
13
20
|
class << self
|
14
21
|
def from_html(text)
|
15
22
|
html = Nokogiri::HTML(text)
|
@@ -31,7 +38,6 @@ module Twitterscraper
|
|
31
38
|
user_id: html.attr('data-user-id').to_i,
|
32
39
|
tweet_id: html.attr('data-tweet-id').to_i,
|
33
40
|
tweet_url: 'https://twitter.com' + html.attr('data-permalink-path'),
|
34
|
-
timestamp: timestamp,
|
35
41
|
created_at: Time.at(timestamp, in: '+00:00'),
|
36
42
|
text: inner_html.xpath("//div[@class[contains(., 'js-tweet-text-container')]]/p[@class[contains(., 'js-tweet-text')]]").first.text,
|
37
43
|
)
|
data/lib/version.rb
CHANGED
data/twitterscraper-ruby.gemspec
CHANGED
@@ -21,9 +21,11 @@ Gem::Specification.new do |spec|
|
|
21
21
|
spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
|
22
22
|
`git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
23
23
|
end
|
24
|
-
spec.
|
25
|
-
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
24
|
+
spec.executables = ["twitterscraper"]
|
26
25
|
spec.require_paths = ["lib"]
|
27
26
|
|
27
|
+
spec.required_ruby_version = ">= 2.6.4"
|
28
|
+
|
28
29
|
spec.add_dependency "nokogiri"
|
30
|
+
spec.add_dependency "parallel"
|
29
31
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: twitterscraper-ruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.7.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- ts-3156
|
8
8
|
autorequire:
|
9
|
-
bindir:
|
9
|
+
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-07-
|
11
|
+
date: 2020-07-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -24,10 +24,25 @@ dependencies:
|
|
24
24
|
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: parallel
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
27
41
|
description: A gem to scrape Tweets
|
28
42
|
email:
|
29
43
|
- ts_3156@yahoo.co.jp
|
30
|
-
executables:
|
44
|
+
executables:
|
45
|
+
- twitterscraper
|
31
46
|
extensions: []
|
32
47
|
extra_rdoc_files: []
|
33
48
|
files:
|
@@ -43,8 +58,10 @@ files:
|
|
43
58
|
- Rakefile
|
44
59
|
- bin/console
|
45
60
|
- bin/setup
|
61
|
+
- bin/twitterscraper
|
46
62
|
- lib/twitterscraper-ruby.rb
|
47
63
|
- lib/twitterscraper.rb
|
64
|
+
- lib/twitterscraper/cli.rb
|
48
65
|
- lib/twitterscraper/client.rb
|
49
66
|
- lib/twitterscraper/http.rb
|
50
67
|
- lib/twitterscraper/lang.rb
|
@@ -69,7 +86,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
69
86
|
requirements:
|
70
87
|
- - ">="
|
71
88
|
- !ruby/object:Gem::Version
|
72
|
-
version: 2.
|
89
|
+
version: 2.6.4
|
73
90
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
74
91
|
requirements:
|
75
92
|
- - ">="
|