twitterscraper-ruby 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/twitterscraper/cli.rb +10 -1
- data/lib/twitterscraper/http.rb +3 -1
- data/lib/twitterscraper/proxy.rb +32 -3
- data/lib/twitterscraper/query.rb +37 -44
- data/lib/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: eda9826c0c4afe8f4ee557a309d82330b0e970882e19d38d917d854ea4bd308b
|
4
|
+
data.tar.gz: 11b36f581640e7ab492b15364ed0521e7a15ad4d9b0e94d5b9d5aece36541d6a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 990044f929c9dbcca4f17eb21730094cdc8d9aaf6b0a53eb012e55cd2738a26d3bd18dcc75456a8dd4d00a132faa1d32e4d04c2bcec5385ee1cfa554b4e7cfab
|
7
|
+
data.tar.gz: 6f50f5add0359866a2c4fa7f2ae78fb5dd96cbf3ab7525be847daee1a40015df2836b5900468159b34e24641cde7dc07267f53ee1e29ccea0401b5f85080f44b
|
data/Gemfile.lock
CHANGED
data/lib/twitterscraper/cli.rb
CHANGED
@@ -14,13 +14,21 @@ module Twitterscraper
|
|
14
14
|
client = Twitterscraper::Client.new
|
15
15
|
limit = options['limit'] ? options['limit'].to_i : 100
|
16
16
|
tweets = client.query_tweets(options['query'], limit: limit, start_date: options['start_date'], end_date: options['end_date'])
|
17
|
-
File.write('tweets.json',
|
17
|
+
File.write('tweets.json', generate_json(tweets))
|
18
18
|
end
|
19
19
|
|
20
20
|
def options
|
21
21
|
@options
|
22
22
|
end
|
23
23
|
|
24
|
+
# Serializes the scraped tweets to a JSON string.
#
# Produces indented, human-readable JSON when the 'pretty' option is set,
# and compact single-line JSON otherwise.
def generate_json(tweets)
  return ::JSON.pretty_generate(tweets) if options['pretty']

  ::JSON.generate(tweets)
end
|
31
|
+
|
24
32
|
def parse_options(argv)
|
25
33
|
argv.getopts(
|
26
34
|
'h',
|
@@ -28,6 +36,7 @@ module Twitterscraper
|
|
28
36
|
'limit:',
|
29
37
|
'start_date:',
|
30
38
|
'end_date:',
|
39
|
+
'pretty',
|
31
40
|
)
|
32
41
|
end
|
33
42
|
end
|
data/lib/twitterscraper/http.rb
CHANGED
@@ -9,6 +9,7 @@ module Twitterscraper
|
|
9
9
|
if proxy
|
10
10
|
ip, port = proxy.split(':')
|
11
11
|
http_class = Net::HTTP::Proxy(ip, port.to_i)
|
12
|
+
Twitterscraper.logger.info("Using proxy #{proxy}")
|
12
13
|
else
|
13
14
|
http_class = Net::HTTP
|
14
15
|
end
|
@@ -24,7 +25,8 @@ module Twitterscraper
|
|
24
25
|
req[key] = value
|
25
26
|
end
|
26
27
|
|
27
|
-
http.request(req)
|
28
|
+
res = http.start { http.request(req) }
|
29
|
+
res.body
|
28
30
|
end
|
29
31
|
end
|
30
32
|
end
|
data/lib/twitterscraper/proxy.rb
CHANGED
@@ -6,22 +6,51 @@ module Twitterscraper
|
|
6
6
|
class RetryExhausted < StandardError
|
7
7
|
end
|
8
8
|
|
9
|
+
# Round-robin pool over a shuffled list of proxy addresses.
#
# Every call to #sample hands out the next entry. Once all entries have been
# handed out, the list is fetched again through Proxy.get_proxies, reshuffled,
# and iteration restarts from the beginning.
class Result
  # @param items [Array<String>] proxy addresses; a shuffled copy is kept
  def initialize(items)
    @items = items.shuffle
    @cur_index = 0
  end

  # Returns the next proxy, reloading the list first when it is exhausted.
  #
  # @return [String, nil] nil when the (re)loaded list is empty
  def sample
    reload if @cur_index >= @items.size
    @cur_index += 1
    @items[@cur_index - 1]
  end

  # @return [Integer] number of proxies currently held
  def size
    @items.size
  end

  # @return [Array<String>] copy of the current proxy list, in hand-out order
  def to_a
    @items.dup
  end

  private

  # Replaces the exhausted list with a freshly fetched, reshuffled one.
  def reload
    # Proxy.get_proxies wraps its list in a Result, which has no #shuffle;
    # Kernel#Array unwraps it through #to_a and passes a plain Array through.
    @items = Array(Proxy.get_proxies).shuffle
    @cur_index = 0
    Twitterscraper.logger.debug "Reload #{@items.size} proxies"
  end
end
|
35
|
+
|
9
36
|
module_function
|
10
37
|
|
11
38
|
def get_proxies(retries = 3)
|
12
39
|
response = Twitterscraper::Http.get(PROXY_URL)
|
13
40
|
html = Nokogiri::HTML(response)
|
14
|
-
table = html.xpath('
|
41
|
+
table = html.xpath('//table[@id="proxylisttable"]').first
|
15
42
|
|
16
43
|
proxies = []
|
17
44
|
|
18
45
|
table.xpath('tbody/tr').each do |tr|
|
19
46
|
cells = tr.xpath('td')
|
20
|
-
ip, port =
|
47
|
+
ip, port, https = [0, 1, 6].map { |i| cells[i].text.strip }
|
48
|
+
next if https == 'no'
|
21
49
|
proxies << ip + ':' + port
|
22
50
|
end
|
23
51
|
|
24
|
-
proxies
|
52
|
+
Twitterscraper.logger.debug "Fetch #{proxies.size} proxies"
|
53
|
+
Result.new(proxies)
|
25
54
|
rescue => e
|
26
55
|
if (retries -= 1) > 0
|
27
56
|
retry
|
data/lib/twitterscraper/query.rb
CHANGED
@@ -25,7 +25,7 @@ module Twitterscraper
|
|
25
25
|
'include_available_features=1&include_entities=1&' +
|
26
26
|
'max_position={pos}&reset_error_state=false'
|
27
27
|
|
28
|
-
def
|
28
|
+
def build_query_url(query, lang, pos, from_user = false)
|
29
29
|
# if from_user
|
30
30
|
# if !pos
|
31
31
|
# INIT_URL_USER.format(u = query)
|
@@ -40,52 +40,45 @@ module Twitterscraper
|
|
40
40
|
end
|
41
41
|
end
|
42
42
|
|
43
|
-
def
|
43
|
+
# Fetches one page through a proxy taken from the pool, retrying on failure.
#
# A new proxy is sampled for every attempt because `retry` re-runs the
# method body from the top.
#
# @param url [String] address to request
# @param headers [Hash] request headers, forwarded to Twitterscraper::Http.get
# @param proxies [#sample] proxy pool
# @param timeout [Integer] forwarded to Twitterscraper::Http.get
# @param retries [Integer] attempt budget; once it is used up the last error
#   is re-raised to the caller
# @return whatever Twitterscraper::Http.get returns
def get_single_page(url, headers, proxies, timeout = 10, retries = 30)
  Twitterscraper::Http.get(url, headers, proxies.sample, timeout)
rescue => e
  # Tag the entry with this method's own name so failures are attributed correctly.
  logger.debug "get_single_page: #{e.inspect}"
  retries -= 1
  raise if retries <= 0

  logger.info("Retrying... (Attempts left: #{retries - 1})")
  retry
end
|
54
|
+
|
55
|
+
# Splits a raw response body into the tweet-list HTML and the decoded JSON.
#
# @param text [String, nil] raw response body
# @param html [Boolean] true when the body is the HTML itself; false when it
#   is a JSON document carrying the HTML under 'items_html'
# @return [Array] two elements: the items HTML (always a String) and the
#   parsed JSON, which is nil for HTML bodies and for nil/empty bodies
# @raise [JSON::ParserError] when a non-empty JSON body is malformed
def parse_single_page(text, html = true)
  # A nil or empty body holds nothing to parse; returning an empty page keeps
  # JSON.parse from raising on it and keeps nil out of the HTML slot.
  return ['', nil] if text.nil? || text.empty?
  return [text, nil] if html

  json_resp = JSON.parse(text)
  logger.debug json_resp['message'] if json_resp['message'] # Sorry, you are rate limited.

  [json_resp['items_html'] || '', json_resp]
end
|
67
|
+
|
68
|
+
def query_single_page(query, lang, pos, from_user = false, headers: [], proxies: [])
|
44
69
|
query = query.gsub(' ', '%20').gsub('#', '%23').gsub(':', '%3A').gsub('&', '%26')
|
45
70
|
logger.info("Querying #{query}")
|
46
71
|
|
47
|
-
url =
|
72
|
+
url = build_query_url(query, lang, pos, from_user)
|
48
73
|
logger.debug("Scraping tweets from #{url}")
|
49
74
|
|
50
|
-
response =
|
51
|
-
|
52
|
-
proxy = proxies.sample
|
53
|
-
logger.info("Using proxy #{proxy}")
|
54
|
-
|
55
|
-
response = Twitterscraper::Http.get(url, headers, proxy, timeout)
|
56
|
-
rescue => e
|
57
|
-
logger.debug "query_single_page: #{e.inspect}"
|
58
|
-
if (retries -= 1) > 0
|
59
|
-
logger.info("Retrying... (Attempts left: #{retries - 1})")
|
60
|
-
retry
|
61
|
-
else
|
62
|
-
raise
|
63
|
-
end
|
64
|
-
end
|
65
|
-
|
66
|
-
html = ''
|
67
|
-
json_resp = nil
|
68
|
-
|
69
|
-
if pos
|
70
|
-
begin
|
71
|
-
json_resp = JSON.parse(response)
|
72
|
-
html = json_resp['items_html'] || ''
|
73
|
-
rescue => e
|
74
|
-
logger.warn("Failed to parse JSON #{e.inspect} while requesting #{url}")
|
75
|
-
end
|
76
|
-
else
|
77
|
-
html = response || ''
|
78
|
-
end
|
75
|
+
response = get_single_page(url, headers, proxies)
|
76
|
+
html, json_resp = parse_single_page(response, pos.nil?)
|
79
77
|
|
80
78
|
tweets = Tweet.from_html(html)
|
81
79
|
|
82
80
|
if tweets.empty?
|
83
|
-
|
84
|
-
pos = json_resp['min_position']
|
85
|
-
else
|
86
|
-
pos = nil
|
87
|
-
end
|
88
|
-
return [], pos
|
81
|
+
return [], (json_resp && json_resp['has_more_items'] && json_resp['min_position'])
|
89
82
|
end
|
90
83
|
|
91
84
|
if json_resp
|
@@ -103,7 +96,7 @@ module Twitterscraper
|
|
103
96
|
if start_date == end_date
|
104
97
|
raise 'Please specify different values for :start_date and :end_date.'
|
105
98
|
elsif start_date > end_date
|
106
|
-
raise '
|
99
|
+
raise ':start_date must occur before :end_date.'
|
107
100
|
end
|
108
101
|
|
109
102
|
# TODO parallel
|
@@ -112,7 +105,6 @@ module Twitterscraper
|
|
112
105
|
all_tweets = []
|
113
106
|
|
114
107
|
proxies = Twitterscraper::Proxy.get_proxies
|
115
|
-
logger.info "Using #{proxies.size} proxies"
|
116
108
|
|
117
109
|
headers = {'User-Agent': USER_AGENT, 'X-Requested-With': 'XMLHttpRequest'}
|
118
110
|
logger.info("Headers #{headers}")
|
@@ -124,13 +116,11 @@ module Twitterscraper
|
|
124
116
|
|
125
117
|
while true
|
126
118
|
new_tweets, new_pos = query_single_page(queries, lang, pos, headers: headers, proxies: proxies)
|
127
|
-
logger.info("Got #{new_tweets.size} tweets")
|
128
|
-
logger.debug("new_pos=#{new_pos}")
|
129
|
-
|
130
119
|
unless new_tweets.empty?
|
131
120
|
all_tweets.concat(new_tweets)
|
132
121
|
all_tweets.uniq! { |t| t.tweet_id }
|
133
122
|
end
|
123
|
+
logger.info("Got #{new_tweets.size} tweets (total #{all_tweets.size})")
|
134
124
|
|
135
125
|
break unless new_pos
|
136
126
|
break if all_tweets.size >= limit
|
@@ -138,7 +128,10 @@ module Twitterscraper
|
|
138
128
|
pos = new_pos
|
139
129
|
end
|
140
130
|
|
141
|
-
|
131
|
+
if all_tweets.size >= limit
|
132
|
+
logger.info("Reached limit #{all_tweets.size}")
|
133
|
+
break
|
134
|
+
end
|
142
135
|
end
|
143
136
|
|
144
137
|
all_tweets
|
data/lib/version.rb
CHANGED