twitterscraper-ruby 0.4.0 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: eda9826c0c4afe8f4ee557a309d82330b0e970882e19d38d917d854ea4bd308b
4
- data.tar.gz: 11b36f581640e7ab492b15364ed0521e7a15ad4d9b0e94d5b9d5aece36541d6a
3
+ metadata.gz: e6701ff59f3eb13db9e3b2d024ea983264b528194d64f4f03a95f3576338ed77
4
+ data.tar.gz: 74106816dd406ef1b355b4d4fc94b1baf4509465f6d5bf1ea8f7c654e518eec0
5
5
  SHA512:
6
- metadata.gz: 990044f929c9dbcca4f17eb21730094cdc8d9aaf6b0a53eb012e55cd2738a26d3bd18dcc75456a8dd4d00a132faa1d32e4d04c2bcec5385ee1cfa554b4e7cfab
7
- data.tar.gz: 6f50f5add0359866a2c4fa7f2ae78fb5dd96cbf3ab7525be847daee1a40015df2836b5900468159b34e24641cde7dc07267f53ee1e29ccea0401b5f85080f44b
6
+ metadata.gz: 9710fb74c90dcbc17a22dd613cfe4dce75106951f1e55cd9cfa94a825ecf0b6773a2851ff1cca842f83b5207d3744ac63bcce29031061b0a0ac84cc12d62b8a3
7
+ data.tar.gz: f0e7cd90ecb773a1837be9245b83f51d60f25d48cab716e3584a8d3e1b6f0fe4951eadaf034850599205209d2bf8d7cd4ebfda97f9f862a528c393a2f81887a7
@@ -1,8 +1,9 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- twitterscraper-ruby (0.4.0)
4
+ twitterscraper-ruby (0.5.0)
5
5
  nokogiri
6
+ parallel
6
7
 
7
8
  GEM
8
9
  remote: https://rubygems.org/
@@ -11,6 +12,7 @@ GEM
11
12
  minitest (5.14.1)
12
13
  nokogiri (1.10.10)
13
14
  mini_portile2 (~> 2.4.0)
15
+ parallel (1.19.2)
14
16
  rake (12.3.3)
15
17
 
16
18
  PLATFORMS
@@ -13,7 +13,8 @@ module Twitterscraper
13
13
  def run
14
14
  client = Twitterscraper::Client.new
15
15
  limit = options['limit'] ? options['limit'].to_i : 100
16
- tweets = client.query_tweets(options['query'], limit: limit, start_date: options['start_date'], end_date: options['end_date'])
16
+ threads = options['threads'] ? options['threads'].to_i : 2
17
+ tweets = client.query_tweets(options['query'], limit: limit, threads: threads, start_date: options['start_date'], end_date: options['end_date'])
17
18
  File.write('tweets.json', generate_json(tweets))
18
19
  end
19
20
 
@@ -36,6 +37,7 @@ module Twitterscraper
36
37
  'limit:',
37
38
  'start_date:',
38
39
  'end_date:',
40
+ 'threads:',
39
41
  'pretty',
40
42
  )
41
43
  end
@@ -9,7 +9,6 @@ module Twitterscraper
9
9
  if proxy
10
10
  ip, port = proxy.split(':')
11
11
  http_class = Net::HTTP::Proxy(ip, port.to_i)
12
- Twitterscraper.logger.info("Using proxy #{proxy}")
13
12
  else
14
13
  http_class = Net::HTTP
15
14
  end
@@ -8,7 +8,7 @@ module Twitterscraper
8
8
 
9
9
  class Result
10
10
  def initialize(items)
11
- @items = items.shuffle
11
+ @items = items
12
12
  @cur_index = 0
13
13
  end
14
14
 
@@ -17,7 +17,9 @@ module Twitterscraper
17
17
  reload
18
18
  end
19
19
  @cur_index += 1
20
- @items[@cur_index - 1]
20
+ item = @items[@cur_index - 1]
21
+ Twitterscraper.logger.info("Using proxy #{item}")
22
+ item
21
23
  end
22
24
 
23
25
  def size
@@ -27,7 +29,7 @@ module Twitterscraper
27
29
  private
28
30
 
29
31
  def reload
30
- @items = Proxy.get_proxies.shuffle
32
+ @items = Proxy.get_proxies
31
33
  @cur_index = 0
32
34
  Twitterscraper.logger.debug "Reload #{proxies.size} proxies"
33
35
  end
@@ -50,7 +52,7 @@ module Twitterscraper
50
52
  end
51
53
 
52
54
  Twitterscraper.logger.debug "Fetch #{proxies.size} proxies"
53
- Result.new(proxies)
55
+ Result.new(proxies.shuffle)
54
56
  rescue => e
55
57
  if (retries -= 1) > 0
56
58
  retry
@@ -1,7 +1,9 @@
1
+ require 'resolv-replace'
1
2
  require 'net/http'
2
3
  require 'nokogiri'
3
4
  require 'date'
4
5
  require 'json'
6
+ require 'parallel'
5
7
 
6
8
  module Twitterscraper
7
9
  module Query
@@ -14,7 +16,6 @@ module Twitterscraper
14
16
  'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
15
17
  'Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre',
16
18
  ]
17
- USER_AGENT = USER_AGENT_LIST.sample
18
19
 
19
20
  INIT_URL = 'https://twitter.com/search?f=tweets&vertical=default&q=__QUERY__&l=__LANG__'
20
21
  RELOAD_URL = 'https://twitter.com/i/search/timeline?f=tweets&vertical=' +
@@ -59,15 +60,15 @@ module Twitterscraper
59
60
  else
60
61
  json_resp = JSON.parse(text)
61
62
  items_html = json_resp['items_html'] || ''
62
- logger.debug json_resp['message'] if json_resp['message'] # Sorry, you are rate limited.
63
+ logger.warn json_resp['message'] if json_resp['message'] # Sorry, you are rate limited.
63
64
  end
64
65
 
65
66
  [items_html, json_resp]
66
67
  end
67
68
 
68
69
  def query_single_page(query, lang, pos, from_user = false, headers: [], proxies: [])
69
- query = query.gsub(' ', '%20').gsub('#', '%23').gsub(':', '%3A').gsub('&', '%26')
70
70
  logger.info("Querying #{query}")
71
+ query = query.gsub(' ', '%20').gsub('#', '%23').gsub(':', '%3A').gsub('&', '%26')
71
72
 
72
73
  url = build_query_url(query, lang, pos, from_user)
73
74
  logger.debug("Scraping tweets from #{url}")
@@ -99,28 +100,31 @@ module Twitterscraper
99
100
  raise ':start_date must occur before :end_date.'
100
101
  end
101
102
 
102
- # TODO parallel
103
-
104
- pos = nil
105
- all_tweets = []
106
-
107
103
  proxies = Twitterscraper::Proxy.get_proxies
108
104
 
109
- headers = {'User-Agent': USER_AGENT, 'X-Requested-With': 'XMLHttpRequest'}
110
- logger.info("Headers #{headers}")
105
+ date_range = start_date.upto(end_date - 1)
106
+ queries = date_range.map { |date| query + " since:#{date} until:#{date + 1}" }
107
+ threads = queries.size if threads > queries.size
108
+ logger.info("Threads #{threads}")
109
+
110
+ all_tweets = []
111
+ mutex = Mutex.new
111
112
 
112
- start_date.upto(end_date) do |date|
113
- break if date == end_date
113
+ Parallel.each(queries, in_threads: threads) do |query|
114
+ headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
115
+ logger.info("Headers #{headers}")
114
116
 
115
- queries = query + " since:#{date} until:#{date + 1}"
117
+ pos = nil
116
118
 
117
119
  while true
118
- new_tweets, new_pos = query_single_page(queries, lang, pos, headers: headers, proxies: proxies)
120
+ new_tweets, new_pos = query_single_page(query, lang, pos, headers: headers, proxies: proxies)
119
121
  unless new_tweets.empty?
120
- all_tweets.concat(new_tweets)
121
- all_tweets.uniq! { |t| t.tweet_id }
122
+ mutex.synchronize {
123
+ all_tweets.concat(new_tweets)
124
+ all_tweets.uniq! { |t| t.tweet_id }
125
+ }
122
126
  end
123
- logger.info("Got #{new_tweets.size} tweets (total #{all_tweets.size})")
127
+ logger.info("Got #{new_tweets.size} tweets (total #{all_tweets.size}) worker=#{Parallel.worker_number}")
124
128
 
125
129
  break unless new_pos
126
130
  break if all_tweets.size >= limit
@@ -130,11 +134,11 @@ module Twitterscraper
130
134
 
131
135
  if all_tweets.size >= limit
132
136
  logger.info("Reached limit #{all_tweets.size}")
133
- break
137
+ raise Parallel::Break
134
138
  end
135
139
  end
136
140
 
137
- all_tweets
141
+ all_tweets.sort_by { |tweet| -tweet.created_at.to_i }
138
142
  end
139
143
  end
140
144
  end
@@ -1,3 +1,3 @@
1
1
  module Twitterscraper
2
- VERSION = "0.4.0"
2
+ VERSION = '0.5.0'
3
3
  end
@@ -27,4 +27,5 @@ Gem::Specification.new do |spec|
27
27
  spec.required_ruby_version = ">= 2.6.4"
28
28
 
29
29
  spec.add_dependency "nokogiri"
30
+ spec.add_dependency "parallel"
30
31
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: twitterscraper-ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - ts-3156
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-07-12 00:00:00.000000000 Z
11
+ date: 2020-07-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -24,6 +24,20 @@ dependencies:
24
24
  - - ">="
25
25
  - !ruby/object:Gem::Version
26
26
  version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: parallel
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
27
41
  description: A gem to scrape Tweets
28
42
  email:
29
43
  - ts_3156@yahoo.co.jp