twitterscraper-ruby 0.4.0 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +3 -1
- data/README.md +131 -16
- data/lib/twitterscraper.rb +1 -2
- data/lib/twitterscraper/cli.rb +58 -9
- data/lib/twitterscraper/http.rb +0 -1
- data/lib/twitterscraper/proxy.rb +10 -8
- data/lib/twitterscraper/query.rb +102 -34
- data/lib/twitterscraper/tweet.rb +66 -5
- data/lib/version.rb +1 -1
- data/twitterscraper-ruby.gemspec +1 -0
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 59b71fc6129f6d8c5a441981dc1577fa9b761380ff119bed4985cfcd88ccb31b
|
4
|
+
data.tar.gz: 2de3fcadc334ee2689d3083ea9324127c3b22ec94cf1b08dec920f9c95771445
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b1e392bc021f6f758b79b7bdcd099af2ac391863f8712dadb5fd19248946867cfd89f140b836532fb40554c82697b26ef3af00b7cbb2cb13b0d5a8e2a38c87e7
|
7
|
+
data.tar.gz: 8c0e81589202e4a094c17604354f0f23a08b4536fe60b58ffe616cf1233c0531547ef02b8e88b6f70b1870ce2d134e4518ee093a5349144e2edfce3b1088e06c
|
data/Gemfile.lock
CHANGED
@@ -1,8 +1,9 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
twitterscraper-ruby (0.4.0)
|
4
|
+
twitterscraper-ruby (0.9.0)
|
5
5
|
nokogiri
|
6
|
+
parallel
|
6
7
|
|
7
8
|
GEM
|
8
9
|
remote: https://rubygems.org/
|
@@ -11,6 +12,7 @@ GEM
|
|
11
12
|
minitest (5.14.1)
|
12
13
|
nokogiri (1.10.10)
|
13
14
|
mini_portile2 (~> 2.4.0)
|
15
|
+
parallel (1.19.2)
|
14
16
|
rake (12.3.3)
|
15
17
|
|
16
18
|
PLATFORMS
|
data/README.md
CHANGED
@@ -1,46 +1,161 @@
|
|
1
1
|
# twitterscraper-ruby
|
2
2
|
|
3
|
-
|
3
|
+
[![Gem Version](https://badge.fury.io/rb/twitterscraper-ruby.svg)](https://badge.fury.io/rb/twitterscraper-ruby)
|
4
4
|
|
5
|
-
|
5
|
+
A gem to scrape https://twitter.com/search. This gem is inspired by [taspinar/twitterscraper](https://github.com/taspinar/twitterscraper).
|
6
6
|
|
7
|
-
## Installation
|
8
7
|
|
9
|
-
|
8
|
+
## Twitter Search API vs. twitterscraper-ruby
|
10
9
|
|
11
|
-
|
12
|
-
gem 'twitterscraper-ruby'
|
13
|
-
```
|
10
|
+
### Twitter Search API
|
14
11
|
|
15
|
-
|
12
|
+
- The number of tweets: 180 - 450 requests/15 minutes (18,000 - 45,000 tweets/15 minutes)
|
13
|
+
- The time window: the past 7 days
|
16
14
|
|
17
|
-
|
15
|
+
### twitterscraper-ruby
|
18
16
|
|
19
|
-
|
17
|
+
- The number of tweets: Unlimited
|
18
|
+
- The time window: from 2006-3-21 to today
|
20
19
|
|
21
|
-
|
20
|
+
|
21
|
+
## Installation
|
22
|
+
|
23
|
+
First install the library:
|
24
|
+
|
25
|
+
```shell script
|
26
|
+
$ gem install twitterscraper-ruby
|
27
|
+
````
|
28
|
+
|
22
29
|
|
23
30
|
## Usage
|
24
31
|
|
32
|
+
Command-line interface:
|
33
|
+
|
34
|
+
```shell script
|
35
|
+
$ twitterscraper --query KEYWORD --start_date 2020-06-01 --end_date 2020-06-30 --lang ja \
|
36
|
+
--limit 100 --threads 10 --proxy --output output.json
|
37
|
+
```
|
38
|
+
|
39
|
+
From Within Ruby:
|
40
|
+
|
25
41
|
```ruby
|
26
42
|
require 'twitterscraper'
|
43
|
+
|
44
|
+
options = {
|
45
|
+
start_date: '2020-06-01',
|
46
|
+
end_date: '2020-06-30',
|
47
|
+
lang: 'ja',
|
48
|
+
limit: 100,
|
49
|
+
threads: 10,
|
50
|
+
proxy: true
|
51
|
+
}
|
52
|
+
|
53
|
+
client = Twitterscraper::Client.new
|
54
|
+
tweets = client.query_tweets(KEYWORD, options)
|
55
|
+
|
56
|
+
tweets.each do |tweet|
|
57
|
+
puts tweet.tweet_id
|
58
|
+
puts tweet.text
|
59
|
+
puts tweet.tweet_url
|
60
|
+
puts tweet.created_at
|
61
|
+
|
62
|
+
hash = tweet.attrs
|
63
|
+
puts hash.keys
|
64
|
+
end
|
27
65
|
```
|
28
66
|
|
29
|
-
## Development
|
30
67
|
|
31
|
-
|
68
|
+
## Attributes
|
69
|
+
|
70
|
+
### Tweet
|
71
|
+
|
72
|
+
- screen_name
|
73
|
+
- name
|
74
|
+
- user_id
|
75
|
+
- tweet_id
|
76
|
+
- text
|
77
|
+
- links
|
78
|
+
- hashtags
|
79
|
+
- image_urls
|
80
|
+
- video_url
|
81
|
+
- has_media
|
82
|
+
- likes
|
83
|
+
- retweets
|
84
|
+
- replies
|
85
|
+
- is_replied
|
86
|
+
- is_reply_to
|
87
|
+
- parent_tweet_id
|
88
|
+
- reply_to_users
|
89
|
+
- tweet_url
|
90
|
+
- created_at
|
91
|
+
|
92
|
+
|
93
|
+
## Search operators
|
94
|
+
|
95
|
+
| Operator | Finds Tweets... |
|
96
|
+
| ------------- | ------------- |
|
97
|
+
| watching now | containing both "watching" and "now". This is the default operator. |
|
98
|
+
| "happy hour" | containing the exact phrase "happy hour". |
|
99
|
+
| love OR hate | containing either "love" or "hate" (or both). |
|
100
|
+
| beer -root | containing "beer" but not "root". |
|
101
|
+
| #haiku | containing the hashtag "haiku". |
|
102
|
+
| from:interior | sent from Twitter account "interior". |
|
103
|
+
| to:NASA | a Tweet authored in reply to Twitter account "NASA". |
|
104
|
+
| @NASA | mentioning Twitter account "NASA". |
|
105
|
+
| puppy filter:media | containing "puppy" and an image or video. |
|
106
|
+
| puppy -filter:retweets | containing "puppy", filtering out retweets |
|
107
|
+
| superhero since:2015-12-21 | containing "superhero" and sent since date "2015-12-21" (year-month-day). |
|
108
|
+
| puppy until:2015-12-21 | containing "puppy" and sent before the date "2015-12-21". |
|
109
|
+
|
110
|
+
Search operators documentation is in [Standard search operators](https://developer.twitter.com/en/docs/tweets/rules-and-filtering/overview/standard-operators).
|
111
|
+
|
112
|
+
|
113
|
+
## Examples
|
114
|
+
|
115
|
+
```shell script
|
116
|
+
$ twitterscraper --query twitter --limit 1000
|
117
|
+
$ cat tweets.json | jq . | less
|
118
|
+
```
|
119
|
+
|
120
|
+
```json
|
121
|
+
[
|
122
|
+
{
|
123
|
+
"screen_name": "@screenname",
|
124
|
+
"name": "name",
|
125
|
+
"user_id": 1194529546483000000,
|
126
|
+
"tweet_id": 1282659891992000000,
|
127
|
+
"tweet_url": "https://twitter.com/screenname/status/1282659891992000000",
|
128
|
+
"created_at": "2020-07-13 12:00:00 +0000",
|
129
|
+
"text": "Thanks Twitter!"
|
130
|
+
}
|
131
|
+
]
|
132
|
+
```
|
133
|
+
|
134
|
+
## CLI Options
|
135
|
+
|
136
|
+
| Option | Description | Default |
|
137
|
+
| ------------- | ------------- | ------------- |
|
138
|
+
| `-h`, `--help` | This option displays a summary of twitterscraper. | |
|
139
|
+
| `--query` | Specify a keyword used during the search. | |
|
140
|
+
| `--start_date` | Set the date from which twitterscraper-ruby should start scraping for your query. | |
|
141
|
+
| `--end_date` | Set the enddate which twitterscraper-ruby should use to stop scraping for your query. | |
|
142
|
+
| `--lang` | Retrieve tweets written in a specific language. | |
|
143
|
+
| `--limit` | Stop scraping when *at least* the number of tweets indicated with --limit is scraped. | 100 |
|
144
|
+
| `--threads` | Set the number of threads twitterscraper-ruby should initiate while scraping for your query. | 2 |
|
145
|
+
| `--proxy` | Scrape https://twitter.com/search via proxies. | false |
|
146
|
+
| `--output` | The name of the output file. | tweets.json |
|
32
147
|
|
33
|
-
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
34
148
|
|
35
149
|
## Contributing
|
36
150
|
|
37
|
-
Bug reports and pull requests are welcome on GitHub at https://github.com/
|
151
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/ts-3156/twitterscraper-ruby. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](https://github.com/ts-3156/twitterscraper-ruby/blob/master/CODE_OF_CONDUCT.md).
|
38
152
|
|
39
153
|
|
40
154
|
## License
|
41
155
|
|
42
156
|
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
43
157
|
|
158
|
+
|
44
159
|
## Code of Conduct
|
45
160
|
|
46
|
-
Everyone interacting in the
|
161
|
+
Everyone interacting in the twitterscraper-ruby project's codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/ts-3156/twitterscraper-ruby/blob/master/CODE_OF_CONDUCT.md).
|
data/lib/twitterscraper.rb
CHANGED
@@ -9,10 +9,9 @@ require 'version'
|
|
9
9
|
|
10
10
|
module Twitterscraper
|
11
11
|
class Error < StandardError; end
|
12
|
-
# Your code goes here...
|
13
12
|
|
14
13
|
def self.logger
|
15
|
-
@logger ||= ::Logger.new(STDOUT)
|
14
|
+
@logger ||= ::Logger.new(STDOUT, level: ::Logger::INFO)
|
16
15
|
end
|
17
16
|
|
18
17
|
def self.logger=(logger)
|
data/lib/twitterscraper/cli.rb
CHANGED
@@ -8,17 +8,24 @@ module Twitterscraper
|
|
8
8
|
class Cli
|
9
9
|
def parse
|
10
10
|
@options = parse_options(ARGV)
|
11
|
+
initialize_logger
|
11
12
|
end
|
12
13
|
|
13
14
|
def run
|
14
|
-
|
15
|
-
|
16
|
-
tweets = client.query_tweets(options['query'], limit: limit, start_date: options['start_date'], end_date: options['end_date'])
|
17
|
-
File.write('tweets.json', generate_json(tweets))
|
18
|
-
end
|
15
|
+
print_help || return if print_help?
|
16
|
+
print_version || return if print_version?
|
19
17
|
|
20
|
-
|
21
|
-
|
18
|
+
query_options = {
|
19
|
+
start_date: options['start_date'],
|
20
|
+
end_date: options['end_date'],
|
21
|
+
lang: options['lang'],
|
22
|
+
limit: options['limit'],
|
23
|
+
threads: options['threads'],
|
24
|
+
proxy: options['proxy']
|
25
|
+
}
|
26
|
+
client = Twitterscraper::Client.new
|
27
|
+
tweets = client.query_tweets(options['query'], query_options)
|
28
|
+
File.write(options['output'], generate_json(tweets)) unless tweets.empty?
|
22
29
|
end
|
23
30
|
|
24
31
|
def generate_json(tweets)
|
@@ -29,15 +36,57 @@ module Twitterscraper
|
|
29
36
|
end
|
30
37
|
end
|
31
38
|
|
39
|
+
def options
|
40
|
+
@options
|
41
|
+
end
|
42
|
+
|
32
43
|
def parse_options(argv)
|
33
|
-
argv.getopts(
|
44
|
+
options = argv.getopts(
|
34
45
|
'h',
|
46
|
+
'help',
|
47
|
+
'v',
|
48
|
+
'version',
|
35
49
|
'query:',
|
36
|
-
'limit:',
|
37
50
|
'start_date:',
|
38
51
|
'end_date:',
|
52
|
+
'lang:',
|
53
|
+
'limit:',
|
54
|
+
'threads:',
|
55
|
+
'output:',
|
56
|
+
'proxy',
|
39
57
|
'pretty',
|
58
|
+
'verbose',
|
40
59
|
)
|
60
|
+
|
61
|
+
options['lang'] ||= ''
|
62
|
+
options['limit'] = (options['limit'] || 100).to_i
|
63
|
+
options['threads'] = (options['threads'] || 2).to_i
|
64
|
+
options['output'] ||= 'tweets.json'
|
65
|
+
|
66
|
+
options
|
67
|
+
end
|
68
|
+
|
69
|
+
def initialize_logger
|
70
|
+
Twitterscraper.logger.level = ::Logger::DEBUG if options['verbose']
|
71
|
+
end
|
72
|
+
|
73
|
+
def print_help?
|
74
|
+
options['h'] || options['help']
|
75
|
+
end
|
76
|
+
|
77
|
+
def print_help
|
78
|
+
puts <<~'SHELL'
|
79
|
+
Usage:
|
80
|
+
twitterscraper --query KEYWORD --limit 100 --threads 10 --start_date 2020-07-01 --end_date 2020-07-10 --lang ja --proxy --output output.json
|
81
|
+
SHELL
|
82
|
+
end
|
83
|
+
|
84
|
+
def print_version?
|
85
|
+
options['v'] || options['version']
|
86
|
+
end
|
87
|
+
|
88
|
+
def print_version
|
89
|
+
puts "twitterscraper-#{Twitterscraper::VERSION}"
|
41
90
|
end
|
42
91
|
end
|
43
92
|
end
|
data/lib/twitterscraper/http.rb
CHANGED
data/lib/twitterscraper/proxy.rb
CHANGED
@@ -6,9 +6,9 @@ module Twitterscraper
|
|
6
6
|
class RetryExhausted < StandardError
|
7
7
|
end
|
8
8
|
|
9
|
-
class
|
10
|
-
def initialize
|
11
|
-
@items =
|
9
|
+
class Pool
|
10
|
+
def initialize
|
11
|
+
@items = Proxy.get_proxies
|
12
12
|
@cur_index = 0
|
13
13
|
end
|
14
14
|
|
@@ -17,7 +17,9 @@ module Twitterscraper
|
|
17
17
|
reload
|
18
18
|
end
|
19
19
|
@cur_index += 1
|
20
|
-
@items[@cur_index - 1]
|
20
|
+
item = @items[@cur_index - 1]
|
21
|
+
Twitterscraper.logger.info("Using proxy #{item}")
|
22
|
+
item
|
21
23
|
end
|
22
24
|
|
23
25
|
def size
|
@@ -27,9 +29,8 @@ module Twitterscraper
|
|
27
29
|
private
|
28
30
|
|
29
31
|
def reload
|
30
|
-
@items = Proxy.get_proxies
|
32
|
+
@items = Proxy.get_proxies
|
31
33
|
@cur_index = 0
|
32
|
-
Twitterscraper.logger.debug "Reload #{proxies.size} proxies"
|
33
34
|
end
|
34
35
|
end
|
35
36
|
|
@@ -44,13 +45,14 @@ module Twitterscraper
|
|
44
45
|
|
45
46
|
table.xpath('tbody/tr').each do |tr|
|
46
47
|
cells = tr.xpath('td')
|
47
|
-
ip, port, https = [0, 1, 6].map { |i| cells[i].text.strip }
|
48
|
+
ip, port, anonymity, https = [0, 1, 4, 6].map { |i| cells[i].text.strip }
|
49
|
+
next unless ['elite proxy', 'anonymous'].include?(anonymity)
|
48
50
|
next if https == 'no'
|
49
51
|
proxies << ip + ':' + port
|
50
52
|
end
|
51
53
|
|
52
54
|
Twitterscraper.logger.debug "Fetch #{proxies.size} proxies"
|
53
|
-
|
55
|
+
proxies.shuffle
|
54
56
|
rescue => e
|
55
57
|
if (retries -= 1) > 0
|
56
58
|
retry
|
data/lib/twitterscraper/query.rb
CHANGED
@@ -1,7 +1,10 @@
|
|
1
|
+
require 'resolv-replace'
|
1
2
|
require 'net/http'
|
2
3
|
require 'nokogiri'
|
3
4
|
require 'date'
|
4
5
|
require 'json'
|
6
|
+
require 'erb'
|
7
|
+
require 'parallel'
|
5
8
|
|
6
9
|
module Twitterscraper
|
7
10
|
module Query
|
@@ -14,7 +17,6 @@ module Twitterscraper
|
|
14
17
|
'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
|
15
18
|
'Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre',
|
16
19
|
]
|
17
|
-
USER_AGENT = USER_AGENT_LIST.sample
|
18
20
|
|
19
21
|
INIT_URL = 'https://twitter.com/search?f=tweets&vertical=default&q=__QUERY__&l=__LANG__'
|
20
22
|
RELOAD_URL = 'https://twitter.com/i/search/timeline?f=tweets&vertical=' +
|
@@ -40,7 +42,8 @@ module Twitterscraper
|
|
40
42
|
end
|
41
43
|
end
|
42
44
|
|
43
|
-
def get_single_page(url, headers, proxies, timeout =
|
45
|
+
def get_single_page(url, headers, proxies, timeout = 6, retries = 30)
|
46
|
+
return nil if stop_requested?
|
44
47
|
Twitterscraper::Http.get(url, headers, proxies.sample, timeout)
|
45
48
|
rescue => e
|
46
49
|
logger.debug "query_single_page: #{e.inspect}"
|
@@ -53,26 +56,30 @@ module Twitterscraper
|
|
53
56
|
end
|
54
57
|
|
55
58
|
def parse_single_page(text, html = true)
|
59
|
+
return [nil, nil] if text.nil? || text == ''
|
60
|
+
|
56
61
|
if html
|
57
62
|
json_resp = nil
|
58
63
|
items_html = text
|
59
64
|
else
|
60
65
|
json_resp = JSON.parse(text)
|
61
66
|
items_html = json_resp['items_html'] || ''
|
62
|
-
logger.
|
67
|
+
logger.warn json_resp['message'] if json_resp['message'] # Sorry, you are rate limited.
|
63
68
|
end
|
64
69
|
|
65
70
|
[items_html, json_resp]
|
66
71
|
end
|
67
72
|
|
68
73
|
def query_single_page(query, lang, pos, from_user = false, headers: [], proxies: [])
|
69
|
-
query = query.gsub(' ', '%20').gsub('#', '%23').gsub(':', '%3A').gsub('&', '%26')
|
70
74
|
logger.info("Querying #{query}")
|
75
|
+
query = ERB::Util.url_encode(query)
|
71
76
|
|
72
77
|
url = build_query_url(query, lang, pos, from_user)
|
73
78
|
logger.debug("Scraping tweets from #{url}")
|
74
79
|
|
75
80
|
response = get_single_page(url, headers, proxies)
|
81
|
+
return [], nil if response.nil?
|
82
|
+
|
76
83
|
html, json_resp = parse_single_page(response, pos.nil?)
|
77
84
|
|
78
85
|
tweets = Tweet.from_html(html)
|
@@ -90,51 +97,112 @@ module Twitterscraper
|
|
90
97
|
end
|
91
98
|
end
|
92
99
|
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
if
|
97
|
-
raise 'Please specify
|
98
|
-
|
99
|
-
|
100
|
+
OLDEST_DATE = Date.parse('2006-03-21')
|
101
|
+
|
102
|
+
def validate_options!(query, start_date:, end_date:, lang:, limit:, threads:, proxy:)
|
103
|
+
if query.nil? || query == ''
|
104
|
+
raise 'Please specify a search query.'
|
105
|
+
end
|
106
|
+
|
107
|
+
if ERB::Util.url_encode(query).length >= 500
|
108
|
+
raise ':query must be a UTF-8, URL-encoded search query of 500 characters maximum, including operators.'
|
109
|
+
end
|
110
|
+
|
111
|
+
if start_date && end_date
|
112
|
+
if start_date == end_date
|
113
|
+
raise 'Please specify different values for :start_date and :end_date.'
|
114
|
+
elsif start_date > end_date
|
115
|
+
raise ':start_date must occur before :end_date.'
|
116
|
+
end
|
117
|
+
end
|
118
|
+
|
119
|
+
if start_date
|
120
|
+
if start_date < OLDEST_DATE
|
121
|
+
raise ":start_date must be greater than or equal to #{OLDEST_DATE}"
|
122
|
+
end
|
100
123
|
end
|
101
124
|
|
102
|
-
|
125
|
+
if end_date
|
126
|
+
today = Date.today
|
127
|
+
if end_date > Date.today
|
128
|
+
raise ":end_date must be less than or equal to today(#{today})"
|
129
|
+
end
|
130
|
+
end
|
131
|
+
end
|
103
132
|
|
133
|
+
def build_queries(query, start_date, end_date)
|
134
|
+
if start_date && end_date
|
135
|
+
date_range = start_date.upto(end_date - 1)
|
136
|
+
date_range.map { |date| query + " since:#{date} until:#{date + 1}" }
|
137
|
+
elsif start_date
|
138
|
+
[query + " since:#{start_date}"]
|
139
|
+
elsif end_date
|
140
|
+
[query + " until:#{end_date}"]
|
141
|
+
else
|
142
|
+
[query]
|
143
|
+
end
|
144
|
+
end
|
145
|
+
|
146
|
+
def main_loop(query, lang, limit, headers, proxies)
|
104
147
|
pos = nil
|
105
|
-
all_tweets = []
|
106
148
|
|
107
|
-
|
149
|
+
while true
|
150
|
+
new_tweets, new_pos = query_single_page(query, lang, pos, headers: headers, proxies: proxies)
|
151
|
+
unless new_tweets.empty?
|
152
|
+
@mutex.synchronize {
|
153
|
+
@all_tweets.concat(new_tweets)
|
154
|
+
@all_tweets.uniq! { |t| t.tweet_id }
|
155
|
+
}
|
156
|
+
end
|
157
|
+
logger.info("Got #{new_tweets.size} tweets (total #{@all_tweets.size})")
|
108
158
|
|
109
|
-
|
110
|
-
|
159
|
+
break unless new_pos
|
160
|
+
break if @all_tweets.size >= limit
|
111
161
|
|
112
|
-
|
113
|
-
|
162
|
+
pos = new_pos
|
163
|
+
end
|
114
164
|
|
115
|
-
|
165
|
+
if @all_tweets.size >= limit
|
166
|
+
logger.info("Limit reached #{@all_tweets.size}")
|
167
|
+
@stop_requested = true
|
168
|
+
end
|
169
|
+
end
|
116
170
|
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
all_tweets.concat(new_tweets)
|
121
|
-
all_tweets.uniq! { |t| t.tweet_id }
|
122
|
-
end
|
123
|
-
logger.info("Got #{new_tweets.size} tweets (total #{all_tweets.size})")
|
171
|
+
def stop_requested?
|
172
|
+
@stop_requested
|
173
|
+
end
|
124
174
|
|
125
|
-
|
126
|
-
|
175
|
+
def query_tweets(query, start_date: nil, end_date: nil, lang: '', limit: 100, threads: 2, proxy: false)
|
176
|
+
start_date = Date.parse(start_date) if start_date && start_date.is_a?(String)
|
177
|
+
end_date = Date.parse(end_date) if end_date && end_date.is_a?(String)
|
178
|
+
queries = build_queries(query, start_date, end_date)
|
179
|
+
threads = queries.size if threads > queries.size
|
180
|
+
proxies = proxy ? Twitterscraper::Proxy::Pool.new : []
|
127
181
|
|
128
|
-
|
129
|
-
|
182
|
+
validate_options!(queries[0], start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads, proxy: proxy)
|
183
|
+
|
184
|
+
logger.info("The number of threads #{threads}")
|
185
|
+
|
186
|
+
headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
|
187
|
+
logger.info("Headers #{headers}")
|
130
188
|
|
131
|
-
|
132
|
-
|
133
|
-
|
189
|
+
@all_tweets = []
|
190
|
+
@mutex = Mutex.new
|
191
|
+
@stop_requested = false
|
192
|
+
|
193
|
+
if threads > 1
|
194
|
+
Parallel.each(queries, in_threads: threads) do |query|
|
195
|
+
main_loop(query, lang, limit, headers, proxies)
|
196
|
+
raise Parallel::Break if stop_requested?
|
197
|
+
end
|
198
|
+
else
|
199
|
+
queries.each do |query|
|
200
|
+
main_loop(query, lang, limit, headers, proxies)
|
201
|
+
break if stop_requested?
|
134
202
|
end
|
135
203
|
end
|
136
204
|
|
137
|
-
all_tweets
|
205
|
+
@all_tweets.sort_by { |tweet| -tweet.created_at.to_i }
|
138
206
|
end
|
139
207
|
end
|
140
208
|
end
|
data/lib/twitterscraper/tweet.rb
CHANGED
@@ -2,7 +2,28 @@ require 'time'
|
|
2
2
|
|
3
3
|
module Twitterscraper
|
4
4
|
class Tweet
|
5
|
-
KEYS = [
|
5
|
+
KEYS = [
|
6
|
+
:screen_name,
|
7
|
+
:name,
|
8
|
+
:user_id,
|
9
|
+
:tweet_id,
|
10
|
+
:text,
|
11
|
+
:links,
|
12
|
+
:hashtags,
|
13
|
+
:image_urls,
|
14
|
+
:video_url,
|
15
|
+
:has_media,
|
16
|
+
:likes,
|
17
|
+
:retweets,
|
18
|
+
:replies,
|
19
|
+
:is_replied,
|
20
|
+
:is_reply_to,
|
21
|
+
:parent_tweet_id,
|
22
|
+
:reply_to_users,
|
23
|
+
:tweet_url,
|
24
|
+
:timestamp,
|
25
|
+
:created_at,
|
26
|
+
]
|
6
27
|
attr_reader *KEYS
|
7
28
|
|
8
29
|
def initialize(attrs)
|
@@ -11,10 +32,14 @@ module Twitterscraper
|
|
11
32
|
end
|
12
33
|
end
|
13
34
|
|
14
|
-
def
|
35
|
+
def attrs
|
15
36
|
KEYS.map do |key|
|
16
37
|
[key, send(key)]
|
17
|
-
end.to_h
|
38
|
+
end.to_h
|
39
|
+
end
|
40
|
+
|
41
|
+
def to_json(options = {})
|
42
|
+
attrs.to_json
|
18
43
|
end
|
19
44
|
|
20
45
|
class << self
|
@@ -31,15 +56,51 @@ module Twitterscraper
|
|
31
56
|
|
32
57
|
def from_tweet_html(html)
|
33
58
|
inner_html = Nokogiri::HTML(html.inner_html)
|
59
|
+
tweet_id = html.attr('data-tweet-id').to_i
|
60
|
+
text = inner_html.xpath("//div[@class[contains(., 'js-tweet-text-container')]]/p[@class[contains(., 'js-tweet-text')]]").first.text
|
61
|
+
links = inner_html.xpath("//a[@class[contains(., 'twitter-timeline-link')]]").map { |elem| elem.attr('data-expanded-url') }.select { |link| link && !link.include?('pic.twitter') }
|
62
|
+
image_urls = inner_html.xpath("//div[@class[contains(., 'AdaptiveMedia-photoContainer')]]").map { |elem| elem.attr('data-image-url') }
|
63
|
+
video_url = inner_html.xpath("//div[@class[contains(., 'PlayableMedia-container')]]/a").map { |elem| elem.attr('href') }[0]
|
64
|
+
has_media = !image_urls.empty? || (video_url && !video_url.empty?)
|
65
|
+
|
66
|
+
actions = inner_html.xpath("//div[@class[contains(., 'ProfileTweet-actionCountList')]]")
|
67
|
+
likes = actions.xpath("//span[@class[contains(., 'ProfileTweet-action--favorite')]]/span[@class[contains(., 'ProfileTweet-actionCount')]]").first.attr('data-tweet-stat-count').to_i || 0
|
68
|
+
retweets = actions.xpath("//span[@class[contains(., 'ProfileTweet-action--retweet')]]/span[@class[contains(., 'ProfileTweet-actionCount')]]").first.attr('data-tweet-stat-count').to_i || 0
|
69
|
+
replies = actions.xpath("//span[@class[contains(., 'ProfileTweet-action--reply u-hiddenVisually')]]/span[@class[contains(., 'ProfileTweet-actionCount')]]").first.attr('data-tweet-stat-count').to_i || 0
|
70
|
+
is_replied = replies != 0
|
71
|
+
|
72
|
+
parent_tweet_id = inner_html.xpath('//*[@data-conversation-id]').first.attr('data-conversation-id').to_i
|
73
|
+
if tweet_id == parent_tweet_id
|
74
|
+
is_reply_to = false
|
75
|
+
parent_tweet_id = nil
|
76
|
+
reply_to_users = []
|
77
|
+
else
|
78
|
+
is_reply_to = true
|
79
|
+
reply_to_users = inner_html.xpath("//div[@class[contains(., 'ReplyingToContextBelowAuthor')]]/a").map { |user| {screen_name: user.text.delete_prefix('@'), user_id: user.attr('data-user-id')} }
|
80
|
+
end
|
81
|
+
|
34
82
|
timestamp = inner_html.xpath("//span[@class[contains(., 'js-short-timestamp')]]").first.attr('data-time').to_i
|
35
83
|
new(
|
36
84
|
screen_name: html.attr('data-screen-name'),
|
37
85
|
name: html.attr('data-name'),
|
38
86
|
user_id: html.attr('data-user-id').to_i,
|
39
|
-
tweet_id:
|
87
|
+
tweet_id: tweet_id,
|
88
|
+
text: text,
|
89
|
+
links: links,
|
90
|
+
hashtags: text.scan(/#\w+/).map { |tag| tag.delete_prefix('#') },
|
91
|
+
image_urls: image_urls,
|
92
|
+
video_url: video_url,
|
93
|
+
has_media: has_media,
|
94
|
+
likes: likes,
|
95
|
+
retweets: retweets,
|
96
|
+
replies: replies,
|
97
|
+
is_replied: is_replied,
|
98
|
+
is_reply_to: is_reply_to,
|
99
|
+
parent_tweet_id: parent_tweet_id,
|
100
|
+
reply_to_users: reply_to_users,
|
40
101
|
tweet_url: 'https://twitter.com' + html.attr('data-permalink-path'),
|
102
|
+
timestamp: timestamp,
|
41
103
|
created_at: Time.at(timestamp, in: '+00:00'),
|
42
|
-
text: inner_html.xpath("//div[@class[contains(., 'js-tweet-text-container')]]/p[@class[contains(., 'js-tweet-text')]]").first.text,
|
43
104
|
)
|
44
105
|
end
|
45
106
|
end
|
data/lib/version.rb
CHANGED
data/twitterscraper-ruby.gemspec
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: twitterscraper-ruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.0
|
4
|
+
version: 0.9.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- ts-3156
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-07-
|
11
|
+
date: 2020-07-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -24,6 +24,20 @@ dependencies:
|
|
24
24
|
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: parallel
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
27
41
|
description: A gem to scrape Tweets
|
28
42
|
email:
|
29
43
|
- ts_3156@yahoo.co.jp
|