twitterscraper-ruby 0.4.0 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +3 -1
- data/lib/twitterscraper/cli.rb +3 -1
- data/lib/twitterscraper/http.rb +0 -1
- data/lib/twitterscraper/proxy.rb +6 -4
- data/lib/twitterscraper/query.rb +23 -19
- data/lib/version.rb +1 -1
- data/twitterscraper-ruby.gemspec +1 -0
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e6701ff59f3eb13db9e3b2d024ea983264b528194d64f4f03a95f3576338ed77
|
4
|
+
data.tar.gz: 74106816dd406ef1b355b4d4fc94b1baf4509465f6d5bf1ea8f7c654e518eec0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9710fb74c90dcbc17a22dd613cfe4dce75106951f1e55cd9cfa94a825ecf0b6773a2851ff1cca842f83b5207d3744ac63bcce29031061b0a0ac84cc12d62b8a3
|
7
|
+
data.tar.gz: f0e7cd90ecb773a1837be9245b83f51d60f25d48cab716e3584a8d3e1b6f0fe4951eadaf034850599205209d2bf8d7cd4ebfda97f9f862a528c393a2f81887a7
|
data/Gemfile.lock
CHANGED
@@ -1,8 +1,9 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
twitterscraper-ruby (0.4.0)
|
4
|
+
twitterscraper-ruby (0.5.0)
|
5
5
|
nokogiri
|
6
|
+
parallel
|
6
7
|
|
7
8
|
GEM
|
8
9
|
remote: https://rubygems.org/
|
@@ -11,6 +12,7 @@ GEM
|
|
11
12
|
minitest (5.14.1)
|
12
13
|
nokogiri (1.10.10)
|
13
14
|
mini_portile2 (~> 2.4.0)
|
15
|
+
parallel (1.19.2)
|
14
16
|
rake (12.3.3)
|
15
17
|
|
16
18
|
PLATFORMS
|
data/lib/twitterscraper/cli.rb
CHANGED
@@ -13,7 +13,8 @@ module Twitterscraper
|
|
13
13
|
def run
|
14
14
|
client = Twitterscraper::Client.new
|
15
15
|
limit = options['limit'] ? options['limit'].to_i : 100
|
16
|
-
|
16
|
+
threads = options['threads'] ? options['threads'].to_i : 2
|
17
|
+
tweets = client.query_tweets(options['query'], limit: limit, threads: threads, start_date: options['start_date'], end_date: options['end_date'])
|
17
18
|
File.write('tweets.json', generate_json(tweets))
|
18
19
|
end
|
19
20
|
|
@@ -36,6 +37,7 @@ module Twitterscraper
|
|
36
37
|
'limit:',
|
37
38
|
'start_date:',
|
38
39
|
'end_date:',
|
40
|
+
'threads:',
|
39
41
|
'pretty',
|
40
42
|
)
|
41
43
|
end
|
data/lib/twitterscraper/http.rb
CHANGED
data/lib/twitterscraper/proxy.rb
CHANGED
@@ -8,7 +8,7 @@ module Twitterscraper
|
|
8
8
|
|
9
9
|
class Result
|
10
10
|
def initialize(items)
|
11
|
-
@items = items
|
11
|
+
@items = items
|
12
12
|
@cur_index = 0
|
13
13
|
end
|
14
14
|
|
@@ -17,7 +17,9 @@ module Twitterscraper
|
|
17
17
|
reload
|
18
18
|
end
|
19
19
|
@cur_index += 1
|
20
|
-
@items[@cur_index - 1]
|
20
|
+
item = @items[@cur_index - 1]
|
21
|
+
Twitterscraper.logger.info("Using proxy #{item}")
|
22
|
+
item
|
21
23
|
end
|
22
24
|
|
23
25
|
def size
|
@@ -27,7 +29,7 @@ module Twitterscraper
|
|
27
29
|
private
|
28
30
|
|
29
31
|
def reload
|
30
|
-
@items = Proxy.get_proxies
|
32
|
+
@items = Proxy.get_proxies
|
31
33
|
@cur_index = 0
|
32
34
|
Twitterscraper.logger.debug "Reload #{proxies.size} proxies"
|
33
35
|
end
|
@@ -50,7 +52,7 @@ module Twitterscraper
|
|
50
52
|
end
|
51
53
|
|
52
54
|
Twitterscraper.logger.debug "Fetch #{proxies.size} proxies"
|
53
|
-
Result.new(proxies)
|
55
|
+
Result.new(proxies.shuffle)
|
54
56
|
rescue => e
|
55
57
|
if (retries -= 1) > 0
|
56
58
|
retry
|
data/lib/twitterscraper/query.rb
CHANGED
@@ -1,7 +1,9 @@
|
|
1
|
+
require 'resolv-replace'
|
1
2
|
require 'net/http'
|
2
3
|
require 'nokogiri'
|
3
4
|
require 'date'
|
4
5
|
require 'json'
|
6
|
+
require 'parallel'
|
5
7
|
|
6
8
|
module Twitterscraper
|
7
9
|
module Query
|
@@ -14,7 +16,6 @@ module Twitterscraper
|
|
14
16
|
'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
|
15
17
|
'Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre',
|
16
18
|
]
|
17
|
-
USER_AGENT = USER_AGENT_LIST.sample
|
18
19
|
|
19
20
|
INIT_URL = 'https://twitter.com/search?f=tweets&vertical=default&q=__QUERY__&l=__LANG__'
|
20
21
|
RELOAD_URL = 'https://twitter.com/i/search/timeline?f=tweets&vertical=' +
|
@@ -59,15 +60,15 @@ module Twitterscraper
|
|
59
60
|
else
|
60
61
|
json_resp = JSON.parse(text)
|
61
62
|
items_html = json_resp['items_html'] || ''
|
62
|
-
logger.
|
63
|
+
logger.warn json_resp['message'] if json_resp['message'] # Sorry, you are rate limited.
|
63
64
|
end
|
64
65
|
|
65
66
|
[items_html, json_resp]
|
66
67
|
end
|
67
68
|
|
68
69
|
def query_single_page(query, lang, pos, from_user = false, headers: [], proxies: [])
|
69
|
-
query = query.gsub(' ', '%20').gsub('#', '%23').gsub(':', '%3A').gsub('&', '%26')
|
70
70
|
logger.info("Querying #{query}")
|
71
|
+
query = query.gsub(' ', '%20').gsub('#', '%23').gsub(':', '%3A').gsub('&', '%26')
|
71
72
|
|
72
73
|
url = build_query_url(query, lang, pos, from_user)
|
73
74
|
logger.debug("Scraping tweets from #{url}")
|
@@ -99,28 +100,31 @@ module Twitterscraper
|
|
99
100
|
raise ':start_date must occur before :end_date.'
|
100
101
|
end
|
101
102
|
|
102
|
-
# TODO parallel
|
103
|
-
|
104
|
-
pos = nil
|
105
|
-
all_tweets = []
|
106
|
-
|
107
103
|
proxies = Twitterscraper::Proxy.get_proxies
|
108
104
|
|
109
|
-
|
110
|
-
|
105
|
+
date_range = start_date.upto(end_date - 1)
|
106
|
+
queries = date_range.map { |date| query + " since:#{date} until:#{date + 1}" }
|
107
|
+
threads = queries.size if threads > queries.size
|
108
|
+
logger.info("Threads #{threads}")
|
109
|
+
|
110
|
+
all_tweets = []
|
111
|
+
mutex = Mutex.new
|
111
112
|
|
112
|
-
|
113
|
-
|
113
|
+
Parallel.each(queries, in_threads: threads) do |query|
|
114
|
+
headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
|
115
|
+
logger.info("Headers #{headers}")
|
114
116
|
|
115
|
-
|
117
|
+
pos = nil
|
116
118
|
|
117
119
|
while true
|
118
|
-
new_tweets, new_pos = query_single_page(
|
120
|
+
new_tweets, new_pos = query_single_page(query, lang, pos, headers: headers, proxies: proxies)
|
119
121
|
unless new_tweets.empty?
|
120
|
-
|
121
|
-
|
122
|
+
mutex.synchronize {
|
123
|
+
all_tweets.concat(new_tweets)
|
124
|
+
all_tweets.uniq! { |t| t.tweet_id }
|
125
|
+
}
|
122
126
|
end
|
123
|
-
logger.info("Got #{new_tweets.size} tweets (total #{all_tweets.size})")
|
127
|
+
logger.info("Got #{new_tweets.size} tweets (total #{all_tweets.size}) worker=#{Parallel.worker_number}")
|
124
128
|
|
125
129
|
break unless new_pos
|
126
130
|
break if all_tweets.size >= limit
|
@@ -130,11 +134,11 @@ module Twitterscraper
|
|
130
134
|
|
131
135
|
if all_tweets.size >= limit
|
132
136
|
logger.info("Reached limit #{all_tweets.size}")
|
133
|
-
|
137
|
+
raise Parallel::Break
|
134
138
|
end
|
135
139
|
end
|
136
140
|
|
137
|
-
all_tweets
|
141
|
+
all_tweets.sort_by { |tweet| -tweet.created_at.to_i }
|
138
142
|
end
|
139
143
|
end
|
140
144
|
end
|
data/lib/version.rb
CHANGED
data/twitterscraper-ruby.gemspec
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: twitterscraper-ruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.0
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- ts-3156
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-07-
|
11
|
+
date: 2020-07-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -24,6 +24,20 @@ dependencies:
|
|
24
24
|
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: parallel
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
27
41
|
description: A gem to scrape Tweets
|
28
42
|
email:
|
29
43
|
- ts_3156@yahoo.co.jp
|