twitterscraper-ruby 0.5.0 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/twitterscraper.rb +1 -2
- data/lib/twitterscraper/cli.rb +57 -10
- data/lib/twitterscraper/proxy.rb +6 -6
- data/lib/twitterscraper/query.rb +5 -4
- data/lib/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fe6db831d59218f3e701e0211487d79ef2354524610f8338a1a17e4cc426e437
|
4
|
+
data.tar.gz: 34cd890b8d2837bcacb3ab7b03fb43845294eecb9d7b0c4891c048eacedbe233
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6c16c89ca290cc3c9ed5fd245c5aa26e5386c95011cfa14277e774e860359495cafec1624fba0af55de98ebbb34abb599e75e210bdbb18b3b11e49bc1527b643
|
7
|
+
data.tar.gz: d54e25e0294eddf8226c0e27a1d46c6128e9066c9d04edc429b382498c0de1af7ccf2e5c1333ad2031210bbefd7f9b7edc73b9a0e79ab2bd3673674b2e648f3c
|
data/Gemfile.lock
CHANGED
data/lib/twitterscraper.rb
CHANGED
@@ -9,10 +9,9 @@ require 'version'
|
|
9
9
|
|
10
10
|
module Twitterscraper
|
11
11
|
class Error < StandardError; end
|
12
|
-
# Your code goes here...
|
13
12
|
|
14
13
|
def self.logger
|
15
|
-
@logger ||= ::Logger.new(STDOUT)
|
14
|
+
@logger ||= ::Logger.new(STDOUT, level: ::Logger::INFO)
|
16
15
|
end
|
17
16
|
|
18
17
|
def self.logger=(logger)
|
data/lib/twitterscraper/cli.rb
CHANGED
@@ -8,18 +8,24 @@ module Twitterscraper
|
|
8
8
|
class Cli
|
9
9
|
def parse
|
10
10
|
@options = parse_options(ARGV)
|
11
|
+
initialize_logger
|
11
12
|
end
|
12
13
|
|
13
14
|
def run
|
14
|
-
|
15
|
-
|
16
|
-
threads = options['threads'] ? options['threads'].to_i : 2
|
17
|
-
tweets = client.query_tweets(options['query'], limit: limit, threads: threads, start_date: options['start_date'], end_date: options['end_date'])
|
18
|
-
File.write('tweets.json', generate_json(tweets))
|
19
|
-
end
|
15
|
+
print_help || return if print_help?
|
16
|
+
print_version || return if print_version?
|
20
17
|
|
21
|
-
|
22
|
-
|
18
|
+
client = Twitterscraper::Client.new
|
19
|
+
query_options = {
|
20
|
+
start_date: options['start_date'],
|
21
|
+
end_date: options['end_date'],
|
22
|
+
lang: options['lang'],
|
23
|
+
limit: options['limit'],
|
24
|
+
threads: options['threads'],
|
25
|
+
proxy: options['proxy']
|
26
|
+
}
|
27
|
+
tweets = client.query_tweets(options['query'], query_options)
|
28
|
+
File.write(options['output'], generate_json(tweets))
|
23
29
|
end
|
24
30
|
|
25
31
|
def generate_json(tweets)
|
@@ -30,16 +36,57 @@ module Twitterscraper
|
|
30
36
|
end
|
31
37
|
end
|
32
38
|
|
39
|
+
def options
|
40
|
+
@options
|
41
|
+
end
|
42
|
+
|
33
43
|
def parse_options(argv)
|
34
|
-
argv.getopts(
|
44
|
+
options = argv.getopts(
|
35
45
|
'h',
|
46
|
+
'help',
|
47
|
+
'v',
|
48
|
+
'version',
|
36
49
|
'query:',
|
37
|
-
'limit:',
|
38
50
|
'start_date:',
|
39
51
|
'end_date:',
|
52
|
+
'lang:',
|
53
|
+
'limit:',
|
40
54
|
'threads:',
|
55
|
+
'output:',
|
56
|
+
'proxy',
|
41
57
|
'pretty',
|
58
|
+
'verbose',
|
42
59
|
)
|
60
|
+
|
61
|
+
options['lang'] ||= ''
|
62
|
+
options['limit'] = (options['limit'] || 100).to_i
|
63
|
+
options['threads'] = (options['threads'] || 2).to_i
|
64
|
+
options['output'] ||= 'tweets.json'
|
65
|
+
|
66
|
+
options
|
67
|
+
end
|
68
|
+
|
69
|
+
def initialize_logger
|
70
|
+
Twitterscraper.logger.level = ::Logger::DEBUG if options['verbose']
|
71
|
+
end
|
72
|
+
|
73
|
+
def print_help?
|
74
|
+
options['h'] || options['help']
|
75
|
+
end
|
76
|
+
|
77
|
+
def print_help
|
78
|
+
puts <<~'SHELL'
|
79
|
+
Usage:
|
80
|
+
twitterscraper --query KEYWORD --limit 100 --threads 10 --start_date 2020-07-01 --end_date 2020-07-10 --lang ja --proxy --output output.json
|
81
|
+
SHELL
|
82
|
+
end
|
83
|
+
|
84
|
+
def print_version?
|
85
|
+
options['v'] || options['version']
|
86
|
+
end
|
87
|
+
|
88
|
+
def print_version
|
89
|
+
puts "twitterscraper-#{Twitterscraper::VERSION}"
|
43
90
|
end
|
44
91
|
end
|
45
92
|
end
|
data/lib/twitterscraper/proxy.rb
CHANGED
@@ -6,9 +6,9 @@ module Twitterscraper
|
|
6
6
|
class RetryExhausted < StandardError
|
7
7
|
end
|
8
8
|
|
9
|
-
class
|
10
|
-
def initialize
|
11
|
-
@items =
|
9
|
+
class Pool
|
10
|
+
def initialize
|
11
|
+
@items = Proxy.get_proxies
|
12
12
|
@cur_index = 0
|
13
13
|
end
|
14
14
|
|
@@ -31,7 +31,6 @@ module Twitterscraper
|
|
31
31
|
def reload
|
32
32
|
@items = Proxy.get_proxies
|
33
33
|
@cur_index = 0
|
34
|
-
Twitterscraper.logger.debug "Reload #{proxies.size} proxies"
|
35
34
|
end
|
36
35
|
end
|
37
36
|
|
@@ -46,13 +45,14 @@ module Twitterscraper
|
|
46
45
|
|
47
46
|
table.xpath('tbody/tr').each do |tr|
|
48
47
|
cells = tr.xpath('td')
|
49
|
-
ip, port, https = [0, 1, 6].map { |i| cells[i].text.strip }
|
48
|
+
ip, port, anonymity, https = [0, 1, 4, 6].map { |i| cells[i].text.strip }
|
49
|
+
next unless ['elite proxy', 'anonymous'].include?(anonymity)
|
50
50
|
next if https == 'no'
|
51
51
|
proxies << ip + ':' + port
|
52
52
|
end
|
53
53
|
|
54
54
|
Twitterscraper.logger.debug "Fetch #{proxies.size} proxies"
|
55
|
-
|
55
|
+
proxies.shuffle
|
56
56
|
rescue => e
|
57
57
|
if (retries -= 1) > 0
|
58
58
|
retry
|
data/lib/twitterscraper/query.rb
CHANGED
@@ -91,7 +91,7 @@ module Twitterscraper
|
|
91
91
|
end
|
92
92
|
end
|
93
93
|
|
94
|
-
def query_tweets(query, start_date: nil, end_date: nil, limit: 100, threads: 2,
|
94
|
+
def query_tweets(query, start_date: nil, end_date: nil, lang: '', limit: 100, threads: 2, proxy: false)
|
95
95
|
start_date = start_date ? Date.parse(start_date) : Date.parse('2006-3-21')
|
96
96
|
end_date = end_date ? Date.parse(end_date) : Date.today
|
97
97
|
if start_date == end_date
|
@@ -100,19 +100,20 @@ module Twitterscraper
|
|
100
100
|
raise ':start_date must occur before :end_date.'
|
101
101
|
end
|
102
102
|
|
103
|
-
proxies = Twitterscraper::Proxy.
|
103
|
+
proxies = proxy ? Twitterscraper::Proxy::Pool.new : []
|
104
104
|
|
105
105
|
date_range = start_date.upto(end_date - 1)
|
106
106
|
queries = date_range.map { |date| query + " since:#{date} until:#{date + 1}" }
|
107
107
|
threads = queries.size if threads > queries.size
|
108
108
|
logger.info("Threads #{threads}")
|
109
109
|
|
110
|
+
headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
|
111
|
+
logger.info("Headers #{headers}")
|
112
|
+
|
110
113
|
all_tweets = []
|
111
114
|
mutex = Mutex.new
|
112
115
|
|
113
116
|
Parallel.each(queries, in_threads: threads) do |query|
|
114
|
-
headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
|
115
|
-
logger.info("Headers #{headers}")
|
116
117
|
|
117
118
|
pos = nil
|
118
119
|
|
data/lib/version.rb
CHANGED