twitterscraper-ruby 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: eda9826c0c4afe8f4ee557a309d82330b0e970882e19d38d917d854ea4bd308b
4
- data.tar.gz: 11b36f581640e7ab492b15364ed0521e7a15ad4d9b0e94d5b9d5aece36541d6a
3
+ metadata.gz: e6701ff59f3eb13db9e3b2d024ea983264b528194d64f4f03a95f3576338ed77
4
+ data.tar.gz: 74106816dd406ef1b355b4d4fc94b1baf4509465f6d5bf1ea8f7c654e518eec0
5
5
  SHA512:
6
- metadata.gz: 990044f929c9dbcca4f17eb21730094cdc8d9aaf6b0a53eb012e55cd2738a26d3bd18dcc75456a8dd4d00a132faa1d32e4d04c2bcec5385ee1cfa554b4e7cfab
7
- data.tar.gz: 6f50f5add0359866a2c4fa7f2ae78fb5dd96cbf3ab7525be847daee1a40015df2836b5900468159b34e24641cde7dc07267f53ee1e29ccea0401b5f85080f44b
6
+ metadata.gz: 9710fb74c90dcbc17a22dd613cfe4dce75106951f1e55cd9cfa94a825ecf0b6773a2851ff1cca842f83b5207d3744ac63bcce29031061b0a0ac84cc12d62b8a3
7
+ data.tar.gz: f0e7cd90ecb773a1837be9245b83f51d60f25d48cab716e3584a8d3e1b6f0fe4951eadaf034850599205209d2bf8d7cd4ebfda97f9f862a528c393a2f81887a7
@@ -1,8 +1,9 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- twitterscraper-ruby (0.4.0)
4
+ twitterscraper-ruby (0.5.0)
5
5
  nokogiri
6
+ parallel
6
7
 
7
8
  GEM
8
9
  remote: https://rubygems.org/
@@ -11,6 +12,7 @@ GEM
11
12
  minitest (5.14.1)
12
13
  nokogiri (1.10.10)
13
14
  mini_portile2 (~> 2.4.0)
15
+ parallel (1.19.2)
14
16
  rake (12.3.3)
15
17
 
16
18
  PLATFORMS
@@ -13,7 +13,8 @@ module Twitterscraper
13
13
  def run
14
14
  client = Twitterscraper::Client.new
15
15
  limit = options['limit'] ? options['limit'].to_i : 100
16
- tweets = client.query_tweets(options['query'], limit: limit, start_date: options['start_date'], end_date: options['end_date'])
16
+ threads = options['threads'] ? options['threads'].to_i : 2
17
+ tweets = client.query_tweets(options['query'], limit: limit, threads: threads, start_date: options['start_date'], end_date: options['end_date'])
17
18
  File.write('tweets.json', generate_json(tweets))
18
19
  end
19
20
 
@@ -36,6 +37,7 @@ module Twitterscraper
36
37
  'limit:',
37
38
  'start_date:',
38
39
  'end_date:',
40
+ 'threads:',
39
41
  'pretty',
40
42
  )
41
43
  end
@@ -9,7 +9,6 @@ module Twitterscraper
9
9
  if proxy
10
10
  ip, port = proxy.split(':')
11
11
  http_class = Net::HTTP::Proxy(ip, port.to_i)
12
- Twitterscraper.logger.info("Using proxy #{proxy}")
13
12
  else
14
13
  http_class = Net::HTTP
15
14
  end
@@ -8,7 +8,7 @@ module Twitterscraper
8
8
 
9
9
  class Result
10
10
  def initialize(items)
11
- @items = items.shuffle
11
+ @items = items
12
12
  @cur_index = 0
13
13
  end
14
14
 
@@ -17,7 +17,9 @@ module Twitterscraper
17
17
  reload
18
18
  end
19
19
  @cur_index += 1
20
- @items[@cur_index - 1]
20
+ item = @items[@cur_index - 1]
21
+ Twitterscraper.logger.info("Using proxy #{item}")
22
+ item
21
23
  end
22
24
 
23
25
  def size
@@ -27,7 +29,7 @@ module Twitterscraper
27
29
  private
28
30
 
29
31
  def reload
30
- @items = Proxy.get_proxies.shuffle
32
+ @items = Proxy.get_proxies
31
33
  @cur_index = 0
32
34
  Twitterscraper.logger.debug "Reload #{proxies.size} proxies"
33
35
  end
@@ -50,7 +52,7 @@ module Twitterscraper
50
52
  end
51
53
 
52
54
  Twitterscraper.logger.debug "Fetch #{proxies.size} proxies"
53
- Result.new(proxies)
55
+ Result.new(proxies.shuffle)
54
56
  rescue => e
55
57
  if (retries -= 1) > 0
56
58
  retry
@@ -1,7 +1,9 @@
1
+ require 'resolv-replace'
1
2
  require 'net/http'
2
3
  require 'nokogiri'
3
4
  require 'date'
4
5
  require 'json'
6
+ require 'parallel'
5
7
 
6
8
  module Twitterscraper
7
9
  module Query
@@ -14,7 +16,6 @@ module Twitterscraper
14
16
  'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
15
17
  'Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre',
16
18
  ]
17
- USER_AGENT = USER_AGENT_LIST.sample
18
19
 
19
20
  INIT_URL = 'https://twitter.com/search?f=tweets&vertical=default&q=__QUERY__&l=__LANG__'
20
21
  RELOAD_URL = 'https://twitter.com/i/search/timeline?f=tweets&vertical=' +
@@ -59,15 +60,15 @@ module Twitterscraper
59
60
  else
60
61
  json_resp = JSON.parse(text)
61
62
  items_html = json_resp['items_html'] || ''
62
- logger.debug json_resp['message'] if json_resp['message'] # Sorry, you are rate limited.
63
+ logger.warn json_resp['message'] if json_resp['message'] # Sorry, you are rate limited.
63
64
  end
64
65
 
65
66
  [items_html, json_resp]
66
67
  end
67
68
 
68
69
  def query_single_page(query, lang, pos, from_user = false, headers: [], proxies: [])
69
- query = query.gsub(' ', '%20').gsub('#', '%23').gsub(':', '%3A').gsub('&', '%26')
70
70
  logger.info("Querying #{query}")
71
+ query = query.gsub(' ', '%20').gsub('#', '%23').gsub(':', '%3A').gsub('&', '%26')
71
72
 
72
73
  url = build_query_url(query, lang, pos, from_user)
73
74
  logger.debug("Scraping tweets from #{url}")
@@ -99,28 +100,31 @@ module Twitterscraper
99
100
  raise ':start_date must occur before :end_date.'
100
101
  end
101
102
 
102
- # TODO parallel
103
-
104
- pos = nil
105
- all_tweets = []
106
-
107
103
  proxies = Twitterscraper::Proxy.get_proxies
108
104
 
109
- headers = {'User-Agent': USER_AGENT, 'X-Requested-With': 'XMLHttpRequest'}
110
- logger.info("Headers #{headers}")
105
+ date_range = start_date.upto(end_date - 1)
106
+ queries = date_range.map { |date| query + " since:#{date} until:#{date + 1}" }
107
+ threads = queries.size if threads > queries.size
108
+ logger.info("Threads #{threads}")
109
+
110
+ all_tweets = []
111
+ mutex = Mutex.new
111
112
 
112
- start_date.upto(end_date) do |date|
113
- break if date == end_date
113
+ Parallel.each(queries, in_threads: threads) do |query|
114
+ headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
115
+ logger.info("Headers #{headers}")
114
116
 
115
- queries = query + " since:#{date} until:#{date + 1}"
117
+ pos = nil
116
118
 
117
119
  while true
118
- new_tweets, new_pos = query_single_page(queries, lang, pos, headers: headers, proxies: proxies)
120
+ new_tweets, new_pos = query_single_page(query, lang, pos, headers: headers, proxies: proxies)
119
121
  unless new_tweets.empty?
120
- all_tweets.concat(new_tweets)
121
- all_tweets.uniq! { |t| t.tweet_id }
122
+ mutex.synchronize {
123
+ all_tweets.concat(new_tweets)
124
+ all_tweets.uniq! { |t| t.tweet_id }
125
+ }
122
126
  end
123
- logger.info("Got #{new_tweets.size} tweets (total #{all_tweets.size})")
127
+ logger.info("Got #{new_tweets.size} tweets (total #{all_tweets.size}) worker=#{Parallel.worker_number}")
124
128
 
125
129
  break unless new_pos
126
130
  break if all_tweets.size >= limit
@@ -130,11 +134,11 @@ module Twitterscraper
130
134
 
131
135
  if all_tweets.size >= limit
132
136
  logger.info("Reached limit #{all_tweets.size}")
133
- break
137
+ raise Parallel::Break
134
138
  end
135
139
  end
136
140
 
137
- all_tweets
141
+ all_tweets.sort_by { |tweet| -tweet.created_at.to_i }
138
142
  end
139
143
  end
140
144
  end
@@ -1,3 +1,3 @@
1
1
  module Twitterscraper
2
- VERSION = "0.4.0"
2
+ VERSION = '0.5.0'
3
3
  end
@@ -27,4 +27,5 @@ Gem::Specification.new do |spec|
27
27
  spec.required_ruby_version = ">= 2.6.4"
28
28
 
29
29
  spec.add_dependency "nokogiri"
30
+ spec.add_dependency "parallel"
30
31
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: twitterscraper-ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - ts-3156
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-07-12 00:00:00.000000000 Z
11
+ date: 2020-07-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -24,6 +24,20 @@ dependencies:
24
24
  - - ">="
25
25
  - !ruby/object:Gem::Version
26
26
  version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: parallel
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
27
41
  description: A gem to scrape Tweets
28
42
  email:
29
43
  - ts_3156@yahoo.co.jp