twitterscraper-ruby 0.9.0 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.circleci/config.yml +31 -0
- data/.gitignore +1 -1
- data/.rspec +2 -0
- data/Gemfile +1 -0
- data/Gemfile.lock +16 -1
- data/README.md +10 -6
- data/bin/twitterscraper +1 -1
- data/lib/twitterscraper.rb +2 -0
- data/lib/twitterscraper/cache.rb +69 -0
- data/lib/twitterscraper/cli.rb +30 -6
- data/lib/twitterscraper/client.rb +13 -0
- data/lib/twitterscraper/proxy.rb +5 -4
- data/lib/twitterscraper/query.rb +64 -27
- data/lib/twitterscraper/template.rb +48 -0
- data/lib/twitterscraper/tweet.rb +18 -3
- data/lib/version.rb +1 -1
- metadata +6 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: cf902c947e866cc99e79fbb9f8a51c829accd44aed03ef7657562bf41932c73d
+  data.tar.gz: 1bc5a0698a17b244ee9228d7728767dd00218179a5a49e0852a74cc722322ef0
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 629de8698af1391c210b496e9aadb51ad5f9d7157b1be5d0aa669ae821671e2b5624ba51083fb14b61f93618ff3e90aea1ac0eccb6ea00360fac48a2dfc436c7
+  data.tar.gz: 3f3706bee5f2a92a2addae034201e2e8cee3fef43efdc323be963cbaf1b94c31c53aa49a19e58a068498722dfe07e9796e097fb04364a9afda56d06132e6b935
data/.circleci/config.yml
ADDED
@@ -0,0 +1,31 @@
+version: 2.1
+orbs:
+  ruby: circleci/ruby@0.1.2
+
+jobs:
+  build:
+    docker:
+      - image: circleci/ruby:2.6.4-stretch-node
+        environment:
+          BUNDLER_VERSION: 2.1.4
+    executor: ruby/default
+    steps:
+      - checkout
+      - run:
+          name: Update bundler
+          command: gem update bundler
+      - run:
+          name: Which bundler?
+          command: bundle -v
+      - restore_cache:
+          keys:
+            - gem-cache-v1-{{ arch }}-{{ .Branch }}-{{ checksum "Gemfile.lock" }}
+            - gem-cache-v1-{{ arch }}-{{ .Branch }}
+            - gem-cache-v1
+      - run: bundle install --path vendor/bundle
+      - run: bundle clean
+      - save_cache:
+          key: gem-cache-v1-{{ arch }}-{{ .Branch }}-{{ checksum "Gemfile.lock" }}
+          paths:
+            - vendor/bundle
+      - run: bundle exec rspec
data/.gitignore
CHANGED
data/.rspec
ADDED
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,19 +1,33 @@
 PATH
   remote: .
   specs:
-    twitterscraper-ruby (0.9.0)
+    twitterscraper-ruby (0.14.0)
       nokogiri
       parallel
 
 GEM
   remote: https://rubygems.org/
   specs:
+    diff-lcs (1.4.4)
     mini_portile2 (2.4.0)
     minitest (5.14.1)
     nokogiri (1.10.10)
       mini_portile2 (~> 2.4.0)
     parallel (1.19.2)
     rake (12.3.3)
+    rspec (3.9.0)
+      rspec-core (~> 3.9.0)
+      rspec-expectations (~> 3.9.0)
+      rspec-mocks (~> 3.9.0)
+    rspec-core (3.9.2)
+      rspec-support (~> 3.9.3)
+    rspec-expectations (3.9.2)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.9.0)
+    rspec-mocks (3.9.1)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.9.0)
+    rspec-support (3.9.3)
 
 PLATFORMS
   ruby
@@ -21,6 +35,7 @@ PLATFORMS
 DEPENDENCIES
   minitest (~> 5.0)
   rake (~> 12.0)
+  rspec
   twitterscraper-ruby!
 
 BUNDLED WITH
data/README.md
CHANGED
@@ -1,5 +1,6 @@
 # twitterscraper-ruby
 
+[![CircleCI](https://circleci.com/gh/ts-3156/twitterscraper-ruby.svg?style=svg)](https://circleci.com/gh/ts-3156/twitterscraper-ruby)
 [![Gem Version](https://badge.fury.io/rb/twitterscraper-ruby.svg)](https://badge.fury.io/rb/twitterscraper-ruby)
 
 A gem to scrape https://twitter.com/search. This gem is inspired by [taspinar/twitterscraper](https://github.com/taspinar/twitterscraper).
@@ -33,7 +34,7 @@ Command-line interface:
 
 ```shell script
 $ twitterscraper --query KEYWORD --start_date 2020-06-01 --end_date 2020-06-30 --lang ja \
-    --limit 100 --threads 10 --
+    --limit 100 --threads 10 --output output.json
 ```
@@ -47,10 +48,9 @@ options = {
   lang: 'ja',
   limit: 100,
   threads: 10,
-  proxy: true
 }
 
-client = Twitterscraper::Client.new
+client = Twitterscraper::Client.new(cache: true, proxy: true)
 tweets = client.query_tweets(KEYWORD, options)
 
 tweets.each do |tweet|
@@ -137,13 +137,17 @@ $ cat tweets.json | jq . | less
 | ------------- | ------------- | ------------- |
 | `-h`, `--help` | This option displays a summary of twitterscraper. | |
 | `--query` | Specify a keyword used during the search. | |
-| `--start_date` |
-| `--end_date` |
+| `--start_date` | Used as "since:yyyy-mm-dd for your query. This means "since the date". | |
+| `--end_date` | Used as "until:yyyy-mm-dd for your query. This means "before the date". | |
 | `--lang` | Retrieve tweets written in a specific language. | |
 | `--limit` | Stop scraping when *at least* the number of tweets indicated with --limit is scraped. | 100 |
+| `--order` | Sort order of the results. | desc |
 | `--threads` | Set the number of threads twitterscraper-ruby should initiate while scraping for your query. | 2 |
-| `--proxy` | Scrape https://twitter.com/search via proxies. |
+| `--proxy` | Scrape https://twitter.com/search via proxies. | true |
+| `--cache` | Enable caching. | true |
+| `--format` | The format of the output. | json |
 | `--output` | The name of the output file. | tweets.json |
+| `--verbose` | Print debug messages. | tweets.json |
 
 
 ## Contributing
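For orientation, the pieces documented above combine as in the sketch below; this assumes `Client#query_tweets` accepts the same keys the CLI passes, and `'ruby'` is a placeholder keyword:

```ruby
require 'twitterscraper'

# cache/proxy now configure the client itself rather than the per-query options.
client = Twitterscraper::Client.new(cache: true, proxy: true)
tweets = client.query_tweets('ruby', start_date: '2020-06-01', end_date: '2020-06-30', limit: 100, order: 'desc')
tweets.each { |tweet| puts tweet.tweet_url }
```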
data/bin/twitterscraper
CHANGED
data/lib/twitterscraper.rb
CHANGED
@@ -2,9 +2,11 @@ require 'twitterscraper/logger'
 require 'twitterscraper/proxy'
 require 'twitterscraper/http'
 require 'twitterscraper/lang'
+require 'twitterscraper/cache'
 require 'twitterscraper/query'
 require 'twitterscraper/client'
 require 'twitterscraper/tweet'
+require 'twitterscraper/template'
 require 'version'
 
 module Twitterscraper
data/lib/twitterscraper/cache.rb
ADDED
@@ -0,0 +1,69 @@
+require 'base64'
+require 'digest/md5'
+
+module Twitterscraper
+  class Cache
+    def initialize()
+      @ttl = 3600 # 1 hour
+      @dir = 'cache'
+      Dir.mkdir(@dir) unless File.exist?(@dir)
+    end
+
+    def read(key)
+      key = cache_key(key)
+      file = File.join(@dir, key)
+      entry = Entry.from_json(File.read(file))
+      entry.value if entry.time > Time.now - @ttl
+    rescue Errno::ENOENT => e
+      nil
+    end
+
+    def write(key, value)
+      key = cache_key(key)
+      entry = Entry.new(key, value, Time.now)
+      file = File.join(@dir, key)
+      File.write(file, entry.to_json)
+    end
+
+    def fetch(key, &block)
+      if (value = read(key))
+        value
+      else
+        yield.tap { |v| write(key, v) }
+      end
+    end
+
+    def cache_key(key)
+      value = key.gsub(':', '%3A').gsub('/', '%2F').gsub('?', '%3F').gsub('=', '%3D').gsub('&', '%26')
+      value = Digest::MD5.hexdigest(value) if value.length >= 100
+      value
+    end
+
+    class Entry < Hash
+      attr_reader :key, :value, :time
+
+      def initialize(key, value, time)
+        @key = key
+        @value = value
+        @time = time
+      end
+
+      def attrs
+        {key: @key, value: @value, time: @time}
+      end
+
+      def to_json
+        hash = attrs
+        hash[:value] = Base64.encode64(hash[:value])
+        hash.to_json
+      end
+
+      class << self
+        def from_json(text)
+          json = JSON.parse(text)
+          new(json['key'], Base64.decode64(json['value']), Time.parse(json['time']))
+        end
+      end
+    end
+  end
+end
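The cache above is a plain file store under ./cache with a fixed one-hour TTL. A minimal sketch of its surface, with placeholder key/value strings (json/time are required defensively, since cache.rb itself only requires base64 and digest/md5):

```ruby
require 'json'
require 'time'
require 'twitterscraper'

cache = Twitterscraper::Cache.new            # creates ./cache if it doesn't exist
cache.write('https://example.com/?q=ruby', '<html>...</html>')
cache.read('https://example.com/?q=ruby')    # => the value, or nil once the 1-hour TTL lapses
cache.fetch('other-key') { 'computed once' } # read, or compute-and-write on a miss
```

Note that values round-trip through Base64 inside Entry#to_json, so they must be strings.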
data/lib/twitterscraper/cli.rb
CHANGED
@@ -20,12 +20,25 @@ module Twitterscraper
         end_date: options['end_date'],
         lang: options['lang'],
         limit: options['limit'],
+        daily_limit: options['daily_limit'],
+        order: options['order'],
         threads: options['threads'],
-        proxy: options['proxy']
       }
-      client = Twitterscraper::Client.new
+      client = Twitterscraper::Client.new(cache: options['cache'], proxy: options['proxy'])
       tweets = client.query_tweets(options['query'], query_options)
-
+      export(tweets) unless tweets.empty?
+    end
+
+    def export(tweets)
+      write_json = lambda { File.write(options['output'], generate_json(tweets)) }
+
+      if options['format'] == 'json'
+        write_json.call
+      elsif options['format'] == 'html'
+        File.write('tweets.html', Template.tweets_embedded_html(tweets))
+      else
+        write_json.call
+      end
     end
 
     def generate_json(tweets)
@@ -51,17 +64,28 @@ module Twitterscraper
         'end_date:',
         'lang:',
         'limit:',
+        'daily_limit:',
+        'order:',
         'threads:',
         'output:',
-        '
+        'format:',
+        'cache:',
+        'proxy:',
         'pretty',
         'verbose',
       )
 
+      options['start_date'] = Query::OLDEST_DATE if options['start_date'] == 'oldest'
      options['lang'] ||= ''
       options['limit'] = (options['limit'] || 100).to_i
+      options['daily_limit'] = options['daily_limit'].to_i if options['daily_limit']
       options['threads'] = (options['threads'] || 2).to_i
-      options['
+      options['format'] ||= 'json'
+      options['order'] ||= 'desc'
+      options['output'] ||= "tweets.#{options['format']}"
+
+      options['cache'] = options['cache'] != 'false'
+      options['proxy'] = options['proxy'] != 'false'
 
       options
     end
@@ -86,7 +110,7 @@ module Twitterscraper
       end
 
       def print_version
-        puts "twitterscraper-#{
+        puts "twitterscraper-#{VERSION}"
       end
     end
   end
 end
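One detail worth noting in the option parsing above: `--cache` and `--proxy` arrive as strings, and any value except the literal string 'false' counts as enabled, which is how both default to on. An illustration only, mirroring `options['cache'] = options['cache'] != 'false'`:

```ruby
['true', nil, 'anything'].map { |v| v != 'false' }  # => [true, true, true]
'false' != 'false'                                  # => false, i.e. --cache false disables it
```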
data/lib/twitterscraper/proxy.rb
CHANGED
@@ -17,15 +17,17 @@ module Twitterscraper
         reload
       end
       @cur_index += 1
-
-      Twitterscraper.logger.info("Using proxy #{item}")
-      item
+      @items[@cur_index - 1]
     end
 
     def size
       @items.size
     end
 
+    def empty?
+      @items.empty?
+    end
+
     private
 
     def reload
@@ -51,7 +53,6 @@ module Twitterscraper
         proxies << ip + ':' + port
       end
 
-      Twitterscraper.logger.debug "Fetch #{proxies.size} proxies"
       proxies.shuffle
     rescue => e
       if (retries -= 1) > 0
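With the logging moved out to query.rb, the pool's visible surface in this diff is small. A sketch of how query.rb drives it; the `sample` name is inferred from query.rb's `proxies.sample` call, and `Pool.new` fetches a proxy list over the network:

```ruby
pool = Twitterscraper::Proxy::Pool.new
puts "#{pool.size} proxies"
puts pool.sample unless pool.empty?  # e.g. "203.0.113.7:8080"; wraps around via the internal reload
```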
data/lib/twitterscraper/query.rb
CHANGED
@@ -44,14 +44,18 @@ module Twitterscraper
 
     def get_single_page(url, headers, proxies, timeout = 6, retries = 30)
       return nil if stop_requested?
-
+      unless proxies.empty?
+        proxy = proxies.sample
+        logger.info("Using proxy #{proxy}")
+      end
+      Http.get(url, headers, proxy, timeout)
     rescue => e
       logger.debug "query_single_page: #{e.inspect}"
       if (retries -= 1) > 0
-        logger.info
+        logger.info "Retrying... (Attempts left: #{retries - 1})"
         retry
       else
-        raise
+        raise Error.new("#{e.inspect} url=#{url}")
       end
     end
 
@@ -71,14 +75,27 @@ module Twitterscraper
     end
 
     def query_single_page(query, lang, pos, from_user = false, headers: [], proxies: [])
-      logger.info
+      logger.info "Querying #{query}"
       query = ERB::Util.url_encode(query)
 
       url = build_query_url(query, lang, pos, from_user)
-
+      http_request = lambda do
+        logger.debug "Scraping tweets from #{url}"
+        get_single_page(url, headers, proxies)
+      end
 
-
-
+      if cache_enabled?
+        client = Cache.new
+        if (response = client.read(url))
+          logger.debug 'Fetching tweets from cache'
+        else
+          response = http_request.call
+          client.write(url, response) unless stop_requested?
+        end
+      else
+        response = http_request.call
+      end
+      return [], nil if response.nil? || response.empty?
 
       html, json_resp = parse_single_page(response, pos.nil?)
 
@@ -99,33 +116,34 @@
 
     OLDEST_DATE = Date.parse('2006-03-21')
 
-    def validate_options!(
+    def validate_options!(queries, start_date:, end_date:, lang:, limit:, threads:)
+      query = queries[0]
       if query.nil? || query == ''
-        raise 'Please specify a search query.'
+        raise Error.new('Please specify a search query.')
       end
 
       if ERB::Util.url_encode(query).length >= 500
-        raise ':query must be a UTF-8, URL-encoded search query of 500 characters maximum, including operators.'
+        raise Error.new(':query must be a UTF-8, URL-encoded search query of 500 characters maximum, including operators.')
       end
 
       if start_date && end_date
         if start_date == end_date
-          raise 'Please specify different values for :start_date and :end_date.'
+          raise Error.new('Please specify different values for :start_date and :end_date.')
         elsif start_date > end_date
-          raise ':start_date must occur before :end_date.'
+          raise Error.new(':start_date must occur before :end_date.')
         end
       end
 
       if start_date
         if start_date < OLDEST_DATE
-          raise ":start_date must be greater than or equal to #{OLDEST_DATE}"
+          raise Error.new(":start_date must be greater than or equal to #{OLDEST_DATE}")
         end
       end
 
       if end_date
         today = Date.today
         if end_date > Date.today
-          raise ":end_date must be less than or equal to today(#{today})"
+          raise Error.new(":end_date must be less than or equal to today(#{today})")
         end
       end
     end
@@ -143,27 +161,32 @@
       end
     end
 
-    def main_loop(query, lang, limit, headers, proxies)
+    def main_loop(query, lang, limit, daily_limit, headers, proxies)
       pos = nil
+      daily_tweets = []
 
      while true
         new_tweets, new_pos = query_single_page(query, lang, pos, headers: headers, proxies: proxies)
         unless new_tweets.empty?
+          daily_tweets.concat(new_tweets)
+          daily_tweets.uniq! { |t| t.tweet_id }
+
           @mutex.synchronize {
             @all_tweets.concat(new_tweets)
             @all_tweets.uniq! { |t| t.tweet_id }
           }
         end
-        logger.info
+        logger.info "Got #{new_tweets.size} tweets (total #{@all_tweets.size})"
 
         break unless new_pos
+        break if daily_limit && daily_tweets.size >= daily_limit
         break if @all_tweets.size >= limit
 
         pos = new_pos
       end
 
-      if @all_tweets.size >= limit
-        logger.
+      if !@stop_requested && @all_tweets.size >= limit
+        logger.warn "The limit you specified has been reached limit=#{limit} tweets=#{@all_tweets.size}"
         @stop_requested = true
       end
     end
@@ -172,37 +195,51 @@
       @stop_requested
     end
 
-    def query_tweets(query, start_date: nil, end_date: nil, lang: '', limit: 100,
+    def query_tweets(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, order: 'desc', threads: 2)
       start_date = Date.parse(start_date) if start_date && start_date.is_a?(String)
       end_date = Date.parse(end_date) if end_date && end_date.is_a?(String)
       queries = build_queries(query, start_date, end_date)
-
-
+      if threads > queries.size
+        logger.warn 'The maximum number of :threads is the number of dates between :start_date and :end_date.'
+        threads = queries.size
+      end
+      if proxy_enabled?
+        proxies = Proxy::Pool.new
+        logger.debug "Fetch #{proxies.size} proxies"
+      else
+        proxies = []
+        logger.debug 'Proxy disabled'
+      end
+      logger.debug "Cache #{cache_enabled? ? 'enabled' : 'disabled'}"
 
-      validate_options!(queries[0], start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads, proxy: proxy)
 
-
+      validate_options!(queries, start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads)
+
+      logger.info "The number of threads #{threads}"
 
       headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
-      logger.info
+      logger.info "Headers #{headers}"
 
       @all_tweets = []
       @mutex = Mutex.new
       @stop_requested = false
 
       if threads > 1
+        Thread.abort_on_exception = true
+        logger.debug "Set 'Thread.abort_on_exception' to true"
+
         Parallel.each(queries, in_threads: threads) do |query|
-          main_loop(query, lang, limit, headers, proxies)
+          main_loop(query, lang, limit, daily_limit, headers, proxies)
           raise Parallel::Break if stop_requested?
         end
       else
        queries.each do |query|
-          main_loop(query, lang, limit, headers, proxies)
+          main_loop(query, lang, limit, daily_limit, headers, proxies)
           break if stop_requested?
         end
       end
 
-      @all_tweets.sort_by { |tweet| -tweet.created_at.to_i }
+      @all_tweets.sort_by { |tweet| (order == 'desc' ? -1 : 1) * tweet.created_at.to_i }
     end
   end
 end
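The two new keyword options slot in as sketched below; this assumes `Client#query_tweets` passes them through to `Query#query_tweets`, as cli.rb above does:

```ruby
client = Twitterscraper::Client.new(cache: true, proxy: false)
tweets = client.query_tweets('ruby',
                             start_date: '2020-06-01',
                             end_date: '2020-06-03',
                             daily_limit: 10,  # each per-day query stops once it has ~10 tweets
                             order: 'asc',     # oldest first; 'desc' remains the default
                             threads: 2)
```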
data/lib/twitterscraper/template.rb
ADDED
@@ -0,0 +1,48 @@
+module Twitterscraper
+  module Template
+    module_function
+
+    def tweets_embedded_html(tweets)
+      tweets_html = tweets.map { |t| EMBED_TWEET_HTML.sub('__TWEET_URL__', t.tweet_url) }
+      EMBED_TWEETS_HTML.sub('__TWEETS__', tweets_html.join)
+    end
+
+    EMBED_TWEET_HTML = <<~'HTML'
+      <blockquote class="twitter-tweet">
+        <a href="__TWEET_URL__"></a>
+      </blockquote>
+    HTML
+
+    EMBED_TWEETS_HTML = <<~'HTML'
+      <html>
+        <head>
+          <style type=text/css>
+            .twitter-tweet {
+              margin: 30px auto 0 auto !important;
+            }
+          </style>
+          <script>
+            window.twttr = (function(d, s, id) {
+              var js, fjs = d.getElementsByTagName(s)[0], t = window.twttr || {};
+              if (d.getElementById(id)) return t;
+              js = d.createElement(s);
+              js.id = id;
+              js.src = "https://platform.twitter.com/widgets.js";
+              fjs.parentNode.insertBefore(js, fjs);
+
+              t._e = [];
+              t.ready = function(f) {
+                t._e.push(f);
+              };
+
+              return t;
+            }(document, "script", "twitter-wjs"));
+          </script>
+        </head>
+        <body>
+          __TWEETS__
+        </body>
+      </html>
+    HTML
+  end
+end
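This template is what cli.rb's export writes when --format html is chosen; the same call works directly, with `tweets` assumed to come from `query_tweets`:

```ruby
File.write('tweets.html', Twitterscraper::Template.tweets_embedded_html(tweets))
```

Each tweet becomes an empty blockquote pointing at its tweet_url, and widgets.js upgrades them to embedded tweets in the browser.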
data/lib/twitterscraper/tweet.rb
CHANGED
@@ -43,6 +43,14 @@ module Twitterscraper
     end
 
     class << self
+      def from_json(text)
+        json = JSON.parse(text)
+        json.map do |tweet|
+          tweet['created_at'] = Time.parse(tweet['created_at'])
+          new(tweet)
+        end
+      end
+
       def from_html(text)
         html = Nokogiri::HTML(text)
         from_tweets_html(html.xpath("//li[@class[contains(., 'js-stream-item')]]/div[@class[contains(., 'js-stream-tweet')]]"))
@@ -51,12 +59,19 @@ module Twitterscraper
       def from_tweets_html(html)
         html.map do |tweet|
           from_tweet_html(tweet)
-        end
+        end.compact
       end
 
       def from_tweet_html(html)
+        screen_name = html.attr('data-screen-name')
+        tweet_id = html.attr('data-tweet-id')&.to_i
+
+        unless html.to_s.include?('js-tweet-text-container')
+          Twitterscraper.logger.warn "html doesn't include div.js-tweet-text-container url=https://twitter.com/#{screen_name}/status/#{tweet_id}"
+          return nil
+        end
+
         inner_html = Nokogiri::HTML(html.inner_html)
-        tweet_id = html.attr('data-tweet-id').to_i
         text = inner_html.xpath("//div[@class[contains(., 'js-tweet-text-container')]]/p[@class[contains(., 'js-tweet-text')]]").first.text
         links = inner_html.xpath("//a[@class[contains(., 'twitter-timeline-link')]]").map { |elem| elem.attr('data-expanded-url') }.select { |link| link && !link.include?('pic.twitter') }
         image_urls = inner_html.xpath("//div[@class[contains(., 'AdaptiveMedia-photoContainer')]]").map { |elem| elem.attr('data-image-url') }
@@ -81,7 +96,7 @@ module Twitterscraper
 
       timestamp = inner_html.xpath("//span[@class[contains(., 'js-short-timestamp')]]").first.attr('data-time').to_i
       new(
-        screen_name:
+        screen_name: screen_name,
         name: html.attr('data-name'),
         user_id: html.attr('data-user-id').to_i,
         tweet_id: tweet_id,
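The new Tweet.from_json is the inverse of the CLI's JSON export, so saved results can be rehydrated; a sketch assuming tweets.json was produced with --format json:

```ruby
require 'twitterscraper'

tweets = Twitterscraper::Tweet.from_json(File.read('tweets.json'))
puts tweets.first.created_at.class if tweets.any?  # => Time, parsed back from the exported string
```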
data/lib/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: twitterscraper-ruby
 version: !ruby/object:Gem::Version
-  version: 0.9.0
+  version: 0.14.0
 platform: ruby
 authors:
 - ts-3156
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2020-07-
+date: 2020-07-16 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -46,8 +46,10 @@ executables:
 extensions: []
 extra_rdoc_files: []
 files:
+- ".circleci/config.yml"
 - ".gitignore"
 - ".irbrc"
+- ".rspec"
 - ".ruby-version"
 - ".travis.yml"
 - CODE_OF_CONDUCT.md
@@ -61,6 +63,7 @@ files:
 - bin/twitterscraper
 - lib/twitterscraper-ruby.rb
 - lib/twitterscraper.rb
+- lib/twitterscraper/cache.rb
 - lib/twitterscraper/cli.rb
 - lib/twitterscraper/client.rb
 - lib/twitterscraper/http.rb
@@ -68,6 +71,7 @@ files:
 - lib/twitterscraper/logger.rb
 - lib/twitterscraper/proxy.rb
 - lib/twitterscraper/query.rb
+- lib/twitterscraper/template.rb
 - lib/twitterscraper/tweet.rb
 - lib/version.rb
 - twitterscraper-ruby.gemspec