twitterscraper-ruby 0.5.0 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e6701ff59f3eb13db9e3b2d024ea983264b528194d64f4f03a95f3576338ed77
4
- data.tar.gz: 74106816dd406ef1b355b4d4fc94b1baf4509465f6d5bf1ea8f7c654e518eec0
3
+ metadata.gz: fe6db831d59218f3e701e0211487d79ef2354524610f8338a1a17e4cc426e437
4
+ data.tar.gz: 34cd890b8d2837bcacb3ab7b03fb43845294eecb9d7b0c4891c048eacedbe233
5
5
  SHA512:
6
- metadata.gz: 9710fb74c90dcbc17a22dd613cfe4dce75106951f1e55cd9cfa94a825ecf0b6773a2851ff1cca842f83b5207d3744ac63bcce29031061b0a0ac84cc12d62b8a3
7
- data.tar.gz: f0e7cd90ecb773a1837be9245b83f51d60f25d48cab716e3584a8d3e1b6f0fe4951eadaf034850599205209d2bf8d7cd4ebfda97f9f862a528c393a2f81887a7
6
+ metadata.gz: 6c16c89ca290cc3c9ed5fd245c5aa26e5386c95011cfa14277e774e860359495cafec1624fba0af55de98ebbb34abb599e75e210bdbb18b3b11e49bc1527b643
7
+ data.tar.gz: d54e25e0294eddf8226c0e27a1d46c6128e9066c9d04edc429b382498c0de1af7ccf2e5c1333ad2031210bbefd7f9b7edc73b9a0e79ab2bd3673674b2e648f3c
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- twitterscraper-ruby (0.5.0)
4
+ twitterscraper-ruby (0.6.0)
5
5
  nokogiri
6
6
  parallel
7
7
 
@@ -9,10 +9,9 @@ require 'version'
9
9
 
10
10
  module Twitterscraper
11
11
  class Error < StandardError; end
12
- # Your code goes here...
13
12
 
14
13
  def self.logger
15
- @logger ||= ::Logger.new(STDOUT)
14
+ @logger ||= ::Logger.new(STDOUT, level: ::Logger::INFO)
16
15
  end
17
16
 
18
17
  def self.logger=(logger)
@@ -8,18 +8,24 @@ module Twitterscraper
8
8
  class Cli
9
9
  def parse
10
10
  @options = parse_options(ARGV)
11
+ initialize_logger
11
12
  end
12
13
 
13
14
  def run
14
- client = Twitterscraper::Client.new
15
- limit = options['limit'] ? options['limit'].to_i : 100
16
- threads = options['threads'] ? options['threads'].to_i : 2
17
- tweets = client.query_tweets(options['query'], limit: limit, threads: threads, start_date: options['start_date'], end_date: options['end_date'])
18
- File.write('tweets.json', generate_json(tweets))
19
- end
15
+ print_help || return if print_help?
16
+ print_version || return if print_version?
20
17
 
21
- def options
22
- @options
18
+ client = Twitterscraper::Client.new
19
+ query_options = {
20
+ start_date: options['start_date'],
21
+ end_date: options['end_date'],
22
+ lang: options['lang'],
23
+ limit: options['limit'],
24
+ threads: options['threads'],
25
+ proxy: options['proxy']
26
+ }
27
+ tweets = client.query_tweets(options['query'], query_options)
28
+ File.write(options['output'], generate_json(tweets))
23
29
  end
24
30
 
25
31
  def generate_json(tweets)
@@ -30,16 +36,57 @@ module Twitterscraper
30
36
  end
31
37
  end
32
38
 
39
+ def options
40
+ @options
41
+ end
42
+
33
43
  def parse_options(argv)
34
- argv.getopts(
44
+ options = argv.getopts(
35
45
  'h',
46
+ 'help',
47
+ 'v',
48
+ 'version',
36
49
  'query:',
37
- 'limit:',
38
50
  'start_date:',
39
51
  'end_date:',
52
+ 'lang:',
53
+ 'limit:',
40
54
  'threads:',
55
+ 'output:',
56
+ 'proxy',
41
57
  'pretty',
58
+ 'verbose',
42
59
  )
60
+
61
+ options['lang'] ||= ''
62
+ options['limit'] = (options['limit'] || 100).to_i
63
+ options['threads'] = (options['threads'] || 2).to_i
64
+ options['output'] ||= 'tweets.json'
65
+
66
+ options
67
+ end
68
+
69
+ def initialize_logger
70
+ Twitterscraper.logger.level = ::Logger::DEBUG if options['verbose']
71
+ end
72
+
73
+ def print_help?
74
+ options['h'] || options['help']
75
+ end
76
+
77
+ def print_help
78
+ puts <<~'SHELL'
79
+ Usage:
80
+ twitterscraper --query KEYWORD --limit 100 --threads 10 --start_date 2020-07-01 --end_date 2020-07-10 --lang ja --proxy --output output.json
81
+ SHELL
82
+ end
83
+
84
+ def print_version?
85
+ options['v'] || options['version']
86
+ end
87
+
88
+ def print_version
89
+ puts "twitterscraper-#{Twitterscraper::VERSION}"
43
90
  end
44
91
  end
45
92
  end
@@ -6,9 +6,9 @@ module Twitterscraper
6
6
  class RetryExhausted < StandardError
7
7
  end
8
8
 
9
- class Result
10
- def initialize(items)
11
- @items = items
9
+ class Pool
10
+ def initialize
11
+ @items = Proxy.get_proxies
12
12
  @cur_index = 0
13
13
  end
14
14
 
@@ -31,7 +31,6 @@ module Twitterscraper
31
31
  def reload
32
32
  @items = Proxy.get_proxies
33
33
  @cur_index = 0
34
- Twitterscraper.logger.debug "Reload #{proxies.size} proxies"
35
34
  end
36
35
  end
37
36
 
@@ -46,13 +45,14 @@ module Twitterscraper
46
45
 
47
46
  table.xpath('tbody/tr').each do |tr|
48
47
  cells = tr.xpath('td')
49
- ip, port, https = [0, 1, 6].map { |i| cells[i].text.strip }
48
+ ip, port, anonymity, https = [0, 1, 4, 6].map { |i| cells[i].text.strip }
49
+ next unless ['elite proxy', 'anonymous'].include?(anonymity)
50
50
  next if https == 'no'
51
51
  proxies << ip + ':' + port
52
52
  end
53
53
 
54
54
  Twitterscraper.logger.debug "Fetch #{proxies.size} proxies"
55
- Result.new(proxies.shuffle)
55
+ proxies.shuffle
56
56
  rescue => e
57
57
  if (retries -= 1) > 0
58
58
  retry
@@ -91,7 +91,7 @@ module Twitterscraper
91
91
  end
92
92
  end
93
93
 
94
- def query_tweets(query, start_date: nil, end_date: nil, limit: 100, threads: 2, lang: '')
94
+ def query_tweets(query, start_date: nil, end_date: nil, lang: '', limit: 100, threads: 2, proxy: false)
95
95
  start_date = start_date ? Date.parse(start_date) : Date.parse('2006-3-21')
96
96
  end_date = end_date ? Date.parse(end_date) : Date.today
97
97
  if start_date == end_date
@@ -100,19 +100,20 @@ module Twitterscraper
100
100
  raise ':start_date must occur before :end_date.'
101
101
  end
102
102
 
103
- proxies = Twitterscraper::Proxy.get_proxies
103
+ proxies = proxy ? Twitterscraper::Proxy::Pool.new : []
104
104
 
105
105
  date_range = start_date.upto(end_date - 1)
106
106
  queries = date_range.map { |date| query + " since:#{date} until:#{date + 1}" }
107
107
  threads = queries.size if threads > queries.size
108
108
  logger.info("Threads #{threads}")
109
109
 
110
+ headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
111
+ logger.info("Headers #{headers}")
112
+
110
113
  all_tweets = []
111
114
  mutex = Mutex.new
112
115
 
113
116
  Parallel.each(queries, in_threads: threads) do |query|
114
- headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
115
- logger.info("Headers #{headers}")
116
117
 
117
118
  pos = nil
118
119
 
@@ -1,3 +1,3 @@
1
1
  module Twitterscraper
2
- VERSION = '0.5.0'
2
+ VERSION = '0.6.0'
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: twitterscraper-ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - ts-3156