twitterscraper-ruby 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 68f4f63474e0c165316575e8d5fc2ac8ec9cdb18218be2f084263effbdf78bb0
4
- data.tar.gz: d91edfdbd1cc36f4bf722e5a4673238a814846066370cd6810af483d089d1768
3
+ metadata.gz: eda9826c0c4afe8f4ee557a309d82330b0e970882e19d38d917d854ea4bd308b
4
+ data.tar.gz: 11b36f581640e7ab492b15364ed0521e7a15ad4d9b0e94d5b9d5aece36541d6a
5
5
  SHA512:
6
- metadata.gz: a7b0f2ce114a2eef72be3147d9459f7b91f15813825cd65baf10b0c891a64a9911c6b4e15db72425925e287043e46ac73cc956f088eb03c5ad960d213b4b4175
7
- data.tar.gz: 8f42a24221aebc9fa361b7dbf9bb23cc683112b03bd549456fdd6f49bf8763d5d8797c73ce1d5cfe60d534f329d649d845a741baedcefaca46a17dd194055778
6
+ metadata.gz: 990044f929c9dbcca4f17eb21730094cdc8d9aaf6b0a53eb012e55cd2738a26d3bd18dcc75456a8dd4d00a132faa1d32e4d04c2bcec5385ee1cfa554b4e7cfab
7
+ data.tar.gz: 6f50f5add0359866a2c4fa7f2ae78fb5dd96cbf3ab7525be847daee1a40015df2836b5900468159b34e24641cde7dc07267f53ee1e29ccea0401b5f85080f44b
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- twitterscraper-ruby (0.3.0)
4
+ twitterscraper-ruby (0.4.0)
5
5
  nokogiri
6
6
 
7
7
  GEM
@@ -14,13 +14,21 @@ module Twitterscraper
14
14
  client = Twitterscraper::Client.new
15
15
  limit = options['limit'] ? options['limit'].to_i : 100
16
16
  tweets = client.query_tweets(options['query'], limit: limit, start_date: options['start_date'], end_date: options['end_date'])
17
- File.write('tweets.json', ::JSON.dump(tweets))
17
+ File.write('tweets.json', generate_json(tweets))
18
18
  end
19
19
 
20
20
  def options
21
21
  @options
22
22
  end
23
23
 
24
+ def generate_json(tweets)
25
+ if options['pretty']
26
+ ::JSON.pretty_generate(tweets)
27
+ else
28
+ ::JSON.generate(tweets)
29
+ end
30
+ end
31
+
24
32
  def parse_options(argv)
25
33
  argv.getopts(
26
34
  'h',
@@ -28,6 +36,7 @@ module Twitterscraper
28
36
  'limit:',
29
37
  'start_date:',
30
38
  'end_date:',
39
+ 'pretty',
31
40
  )
32
41
  end
33
42
  end
@@ -9,6 +9,7 @@ module Twitterscraper
9
9
  if proxy
10
10
  ip, port = proxy.split(':')
11
11
  http_class = Net::HTTP::Proxy(ip, port.to_i)
12
+ Twitterscraper.logger.info("Using proxy #{proxy}")
12
13
  else
13
14
  http_class = Net::HTTP
14
15
  end
@@ -24,7 +25,8 @@ module Twitterscraper
24
25
  req[key] = value
25
26
  end
26
27
 
27
- http.request(req).body
28
+ res = http.start { http.request(req) }
29
+ res.body
28
30
  end
29
31
  end
30
32
  end
@@ -6,22 +6,51 @@ module Twitterscraper
6
6
  class RetryExhausted < StandardError
7
7
  end
8
8
 
9
+ class Result
10
+ def initialize(items)
11
+ @items = items.shuffle
12
+ @cur_index = 0
13
+ end
14
+
15
+ def sample
16
+ if @cur_index >= @items.size
17
+ reload
18
+ end
19
+ @cur_index += 1
20
+ @items[@cur_index - 1]
21
+ end
22
+
23
+ def size
24
+ @items.size
25
+ end
26
+
27
+ private
28
+
29
+ def reload
30
+ @items = Proxy.get_proxies.shuffle
31
+ @cur_index = 0
32
+ Twitterscraper.logger.debug "Reload #{proxies.size} proxies"
33
+ end
34
+ end
35
+
9
36
  module_function
10
37
 
11
38
  def get_proxies(retries = 3)
12
39
  response = Twitterscraper::Http.get(PROXY_URL)
13
40
  html = Nokogiri::HTML(response)
14
- table = html.xpath('//*[@id="proxylisttable"]').first
41
+ table = html.xpath('//table[@id="proxylisttable"]').first
15
42
 
16
43
  proxies = []
17
44
 
18
45
  table.xpath('tbody/tr').each do |tr|
19
46
  cells = tr.xpath('td')
20
- ip, port = cells[0].text.strip, cells[1].text.strip
47
+ ip, port, https = [0, 1, 6].map { |i| cells[i].text.strip }
48
+ next if https == 'no'
21
49
  proxies << ip + ':' + port
22
50
  end
23
51
 
24
- proxies
52
+ Twitterscraper.logger.debug "Fetch #{proxies.size} proxies"
53
+ Result.new(proxies)
25
54
  rescue => e
26
55
  if (retries -= 1) > 0
27
56
  retry
@@ -25,7 +25,7 @@ module Twitterscraper
25
25
  'include_available_features=1&include_entities=1&' +
26
26
  'max_position={pos}&reset_error_state=false'
27
27
 
28
- def get_query_url(query, lang, pos, from_user = false)
28
+ def build_query_url(query, lang, pos, from_user = false)
29
29
  # if from_user
30
30
  # if !pos
31
31
  # INIT_URL_USER.format(u = query)
@@ -40,52 +40,45 @@ module Twitterscraper
40
40
  end
41
41
  end
42
42
 
43
- def query_single_page(query, lang, pos, retries = 30, from_user = false, timeout = 3, headers: [], proxies: [])
43
+ def get_single_page(url, headers, proxies, timeout = 10, retries = 30)
44
+ Twitterscraper::Http.get(url, headers, proxies.sample, timeout)
45
+ rescue => e
46
+ logger.debug "query_single_page: #{e.inspect}"
47
+ if (retries -= 1) > 0
48
+ logger.info("Retrying... (Attempts left: #{retries - 1})")
49
+ retry
50
+ else
51
+ raise
52
+ end
53
+ end
54
+
55
+ def parse_single_page(text, html = true)
56
+ if html
57
+ json_resp = nil
58
+ items_html = text
59
+ else
60
+ json_resp = JSON.parse(text)
61
+ items_html = json_resp['items_html'] || ''
62
+ logger.debug json_resp['message'] if json_resp['message'] # Sorry, you are rate limited.
63
+ end
64
+
65
+ [items_html, json_resp]
66
+ end
67
+
68
+ def query_single_page(query, lang, pos, from_user = false, headers: [], proxies: [])
44
69
  query = query.gsub(' ', '%20').gsub('#', '%23').gsub(':', '%3A').gsub('&', '%26')
45
70
  logger.info("Querying #{query}")
46
71
 
47
- url = get_query_url(query, lang, pos, from_user)
72
+ url = build_query_url(query, lang, pos, from_user)
48
73
  logger.debug("Scraping tweets from #{url}")
49
74
 
50
- response = nil
51
- begin
52
- proxy = proxies.sample
53
- logger.info("Using proxy #{proxy}")
54
-
55
- response = Twitterscraper::Http.get(url, headers, proxy, timeout)
56
- rescue => e
57
- logger.debug "query_single_page: #{e.inspect}"
58
- if (retries -= 1) > 0
59
- logger.info("Retrying... (Attempts left: #{retries - 1})")
60
- retry
61
- else
62
- raise
63
- end
64
- end
65
-
66
- html = ''
67
- json_resp = nil
68
-
69
- if pos
70
- begin
71
- json_resp = JSON.parse(response)
72
- html = json_resp['items_html'] || ''
73
- rescue => e
74
- logger.warn("Failed to parse JSON #{e.inspect} while requesting #{url}")
75
- end
76
- else
77
- html = response || ''
78
- end
75
+ response = get_single_page(url, headers, proxies)
76
+ html, json_resp = parse_single_page(response, pos.nil?)
79
77
 
80
78
  tweets = Tweet.from_html(html)
81
79
 
82
80
  if tweets.empty?
83
- if json_resp && json_resp['has_more_items']
84
- pos = json_resp['min_position']
85
- else
86
- pos = nil
87
- end
88
- return [], pos
81
+ return [], (json_resp && json_resp['has_more_items'] && json_resp['min_position'])
89
82
  end
90
83
 
91
84
  if json_resp
@@ -103,7 +96,7 @@ module Twitterscraper
103
96
  if start_date == end_date
104
97
  raise 'Please specify different values for :start_date and :end_date.'
105
98
  elsif start_date > end_date
106
- raise 'The :start_date must occur before :end_date.'
99
+ raise ':start_date must occur before :end_date.'
107
100
  end
108
101
 
109
102
  # TODO parallel
@@ -112,7 +105,6 @@ module Twitterscraper
112
105
  all_tweets = []
113
106
 
114
107
  proxies = Twitterscraper::Proxy.get_proxies
115
- logger.info "Using #{proxies.size} proxies"
116
108
 
117
109
  headers = {'User-Agent': USER_AGENT, 'X-Requested-With': 'XMLHttpRequest'}
118
110
  logger.info("Headers #{headers}")
@@ -124,13 +116,11 @@ module Twitterscraper
124
116
 
125
117
  while true
126
118
  new_tweets, new_pos = query_single_page(queries, lang, pos, headers: headers, proxies: proxies)
127
- logger.info("Got #{new_tweets.size} tweets")
128
- logger.debug("new_pos=#{new_pos}")
129
-
130
119
  unless new_tweets.empty?
131
120
  all_tweets.concat(new_tweets)
132
121
  all_tweets.uniq! { |t| t.tweet_id }
133
122
  end
123
+ logger.info("Got #{new_tweets.size} tweets (total #{all_tweets.size})")
134
124
 
135
125
  break unless new_pos
136
126
  break if all_tweets.size >= limit
@@ -138,7 +128,10 @@ module Twitterscraper
138
128
  pos = new_pos
139
129
  end
140
130
 
141
- break if all_tweets.size >= limit
131
+ if all_tweets.size >= limit
132
+ logger.info("Reached limit #{all_tweets.size}")
133
+ break
134
+ end
142
135
  end
143
136
 
144
137
  all_tweets
@@ -1,3 +1,3 @@
1
1
  module Twitterscraper
2
- VERSION = "0.3.0"
2
+ VERSION = "0.4.0"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: twitterscraper-ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - ts-3156