twitterscraper-ruby 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e6701ff59f3eb13db9e3b2d024ea983264b528194d64f4f03a95f3576338ed77
4
- data.tar.gz: 74106816dd406ef1b355b4d4fc94b1baf4509465f6d5bf1ea8f7c654e518eec0
3
+ metadata.gz: fe6db831d59218f3e701e0211487d79ef2354524610f8338a1a17e4cc426e437
4
+ data.tar.gz: 34cd890b8d2837bcacb3ab7b03fb43845294eecb9d7b0c4891c048eacedbe233
5
5
  SHA512:
6
- metadata.gz: 9710fb74c90dcbc17a22dd613cfe4dce75106951f1e55cd9cfa94a825ecf0b6773a2851ff1cca842f83b5207d3744ac63bcce29031061b0a0ac84cc12d62b8a3
7
- data.tar.gz: f0e7cd90ecb773a1837be9245b83f51d60f25d48cab716e3584a8d3e1b6f0fe4951eadaf034850599205209d2bf8d7cd4ebfda97f9f862a528c393a2f81887a7
6
+ metadata.gz: 6c16c89ca290cc3c9ed5fd245c5aa26e5386c95011cfa14277e774e860359495cafec1624fba0af55de98ebbb34abb599e75e210bdbb18b3b11e49bc1527b643
7
+ data.tar.gz: d54e25e0294eddf8226c0e27a1d46c6128e9066c9d04edc429b382498c0de1af7ccf2e5c1333ad2031210bbefd7f9b7edc73b9a0e79ab2bd3673674b2e648f3c
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- twitterscraper-ruby (0.5.0)
4
+ twitterscraper-ruby (0.6.0)
5
5
  nokogiri
6
6
  parallel
7
7
 
@@ -9,10 +9,9 @@ require 'version'
9
9
 
10
10
  module Twitterscraper
11
11
  class Error < StandardError; end
12
- # Your code goes here...
13
12
 
14
13
  def self.logger
15
- @logger ||= ::Logger.new(STDOUT)
14
+ @logger ||= ::Logger.new(STDOUT, level: ::Logger::INFO)
16
15
  end
17
16
 
18
17
  def self.logger=(logger)
@@ -8,18 +8,24 @@ module Twitterscraper
8
8
  class Cli
9
9
  def parse
10
10
  @options = parse_options(ARGV)
11
+ initialize_logger
11
12
  end
12
13
 
13
14
  def run
14
- client = Twitterscraper::Client.new
15
- limit = options['limit'] ? options['limit'].to_i : 100
16
- threads = options['threads'] ? options['threads'].to_i : 2
17
- tweets = client.query_tweets(options['query'], limit: limit, threads: threads, start_date: options['start_date'], end_date: options['end_date'])
18
- File.write('tweets.json', generate_json(tweets))
19
- end
15
+ print_help || return if print_help?
16
+ print_version || return if print_version?
20
17
 
21
- def options
22
- @options
18
+ client = Twitterscraper::Client.new
19
+ query_options = {
20
+ start_date: options['start_date'],
21
+ end_date: options['end_date'],
22
+ lang: options['lang'],
23
+ limit: options['limit'],
24
+ threads: options['threads'],
25
+ proxy: options['proxy']
26
+ }
27
+ tweets = client.query_tweets(options['query'], query_options)
28
+ File.write(options['output'], generate_json(tweets))
23
29
  end
24
30
 
25
31
  def generate_json(tweets)
@@ -30,16 +36,57 @@ module Twitterscraper
30
36
  end
31
37
  end
32
38
 
39
+ def options
40
+ @options
41
+ end
42
+
33
43
  def parse_options(argv)
34
- argv.getopts(
44
+ options = argv.getopts(
35
45
  'h',
46
+ 'help',
47
+ 'v',
48
+ 'version',
36
49
  'query:',
37
- 'limit:',
38
50
  'start_date:',
39
51
  'end_date:',
52
+ 'lang:',
53
+ 'limit:',
40
54
  'threads:',
55
+ 'output:',
56
+ 'proxy',
41
57
  'pretty',
58
+ 'verbose',
42
59
  )
60
+
61
+ options['lang'] ||= ''
62
+ options['limit'] = (options['limit'] || 100).to_i
63
+ options['threads'] = (options['threads'] || 2).to_i
64
+ options['output'] ||= 'tweets.json'
65
+
66
+ options
67
+ end
68
+
69
+ def initialize_logger
70
+ Twitterscraper.logger.level = ::Logger::DEBUG if options['verbose']
71
+ end
72
+
73
+ def print_help?
74
+ options['h'] || options['help']
75
+ end
76
+
77
+ def print_help
78
+ puts <<~'SHELL'
79
+ Usage:
80
+ twitterscraper --query KEYWORD --limit 100 --threads 10 --start_date 2020-07-01 --end_date 2020-07-10 --lang ja --proxy --output output.json
81
+ SHELL
82
+ end
83
+
84
+ def print_version?
85
+ options['v'] || options['version']
86
+ end
87
+
88
+ def print_version
89
+ puts "twitterscraper-#{Twitterscraper::VERSION}"
43
90
  end
44
91
  end
45
92
  end
@@ -6,9 +6,9 @@ module Twitterscraper
6
6
  class RetryExhausted < StandardError
7
7
  end
8
8
 
9
- class Result
10
- def initialize(items)
11
- @items = items
9
+ class Pool
10
+ def initialize
11
+ @items = Proxy.get_proxies
12
12
  @cur_index = 0
13
13
  end
14
14
 
@@ -31,7 +31,6 @@ module Twitterscraper
31
31
  def reload
32
32
  @items = Proxy.get_proxies
33
33
  @cur_index = 0
34
- Twitterscraper.logger.debug "Reload #{proxies.size} proxies"
35
34
  end
36
35
  end
37
36
 
@@ -46,13 +45,14 @@ module Twitterscraper
46
45
 
47
46
  table.xpath('tbody/tr').each do |tr|
48
47
  cells = tr.xpath('td')
49
- ip, port, https = [0, 1, 6].map { |i| cells[i].text.strip }
48
+ ip, port, anonymity, https = [0, 1, 4, 6].map { |i| cells[i].text.strip }
49
+ next unless ['elite proxy', 'anonymous'].include?(anonymity)
50
50
  next if https == 'no'
51
51
  proxies << ip + ':' + port
52
52
  end
53
53
 
54
54
  Twitterscraper.logger.debug "Fetch #{proxies.size} proxies"
55
- Result.new(proxies.shuffle)
55
+ proxies.shuffle
56
56
  rescue => e
57
57
  if (retries -= 1) > 0
58
58
  retry
@@ -91,7 +91,7 @@ module Twitterscraper
91
91
  end
92
92
  end
93
93
 
94
- def query_tweets(query, start_date: nil, end_date: nil, limit: 100, threads: 2, lang: '')
94
+ def query_tweets(query, start_date: nil, end_date: nil, lang: '', limit: 100, threads: 2, proxy: false)
95
95
  start_date = start_date ? Date.parse(start_date) : Date.parse('2006-3-21')
96
96
  end_date = end_date ? Date.parse(end_date) : Date.today
97
97
  if start_date == end_date
@@ -100,19 +100,20 @@ module Twitterscraper
100
100
  raise ':start_date must occur before :end_date.'
101
101
  end
102
102
 
103
- proxies = Twitterscraper::Proxy.get_proxies
103
+ proxies = proxy ? Twitterscraper::Proxy::Pool.new : []
104
104
 
105
105
  date_range = start_date.upto(end_date - 1)
106
106
  queries = date_range.map { |date| query + " since:#{date} until:#{date + 1}" }
107
107
  threads = queries.size if threads > queries.size
108
108
  logger.info("Threads #{threads}")
109
109
 
110
+ headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
111
+ logger.info("Headers #{headers}")
112
+
110
113
  all_tweets = []
111
114
  mutex = Mutex.new
112
115
 
113
116
  Parallel.each(queries, in_threads: threads) do |query|
114
- headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
115
- logger.info("Headers #{headers}")
116
117
 
117
118
  pos = nil
118
119
 
@@ -1,3 +1,3 @@
1
1
  module Twitterscraper
2
- VERSION = '0.5.0'
2
+ VERSION = '0.6.0'
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: twitterscraper-ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - ts-3156