twitterscraper-ruby 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +3 -1
- data/lib/twitterscraper/cli.rb +3 -1
- data/lib/twitterscraper/http.rb +0 -1
- data/lib/twitterscraper/proxy.rb +6 -4
- data/lib/twitterscraper/query.rb +23 -19
- data/lib/version.rb +1 -1
- data/twitterscraper-ruby.gemspec +1 -0
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e6701ff59f3eb13db9e3b2d024ea983264b528194d64f4f03a95f3576338ed77
|
4
|
+
data.tar.gz: 74106816dd406ef1b355b4d4fc94b1baf4509465f6d5bf1ea8f7c654e518eec0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9710fb74c90dcbc17a22dd613cfe4dce75106951f1e55cd9cfa94a825ecf0b6773a2851ff1cca842f83b5207d3744ac63bcce29031061b0a0ac84cc12d62b8a3
|
7
|
+
data.tar.gz: f0e7cd90ecb773a1837be9245b83f51d60f25d48cab716e3584a8d3e1b6f0fe4951eadaf034850599205209d2bf8d7cd4ebfda97f9f862a528c393a2f81887a7
|
data/Gemfile.lock
CHANGED
@@ -1,8 +1,9 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
twitterscraper-ruby (0.4.0)
|
4
|
+
twitterscraper-ruby (0.5.0)
|
5
5
|
nokogiri
|
6
|
+
parallel
|
6
7
|
|
7
8
|
GEM
|
8
9
|
remote: https://rubygems.org/
|
@@ -11,6 +12,7 @@ GEM
|
|
11
12
|
minitest (5.14.1)
|
12
13
|
nokogiri (1.10.10)
|
13
14
|
mini_portile2 (~> 2.4.0)
|
15
|
+
parallel (1.19.2)
|
14
16
|
rake (12.3.3)
|
15
17
|
|
16
18
|
PLATFORMS
|
data/lib/twitterscraper/cli.rb
CHANGED
@@ -13,7 +13,8 @@ module Twitterscraper
|
|
13
13
|
def run
|
14
14
|
client = Twitterscraper::Client.new
|
15
15
|
limit = options['limit'] ? options['limit'].to_i : 100
|
16
|
-
|
16
|
+
threads = options['threads'] ? options['threads'].to_i : 2
|
17
|
+
tweets = client.query_tweets(options['query'], limit: limit, threads: threads, start_date: options['start_date'], end_date: options['end_date'])
|
17
18
|
File.write('tweets.json', generate_json(tweets))
|
18
19
|
end
|
19
20
|
|
@@ -36,6 +37,7 @@ module Twitterscraper
|
|
36
37
|
'limit:',
|
37
38
|
'start_date:',
|
38
39
|
'end_date:',
|
40
|
+
'threads:',
|
39
41
|
'pretty',
|
40
42
|
)
|
41
43
|
end
|
data/lib/twitterscraper/http.rb
CHANGED
data/lib/twitterscraper/proxy.rb
CHANGED
@@ -8,7 +8,7 @@ module Twitterscraper
|
|
8
8
|
|
9
9
|
class Result
|
10
10
|
def initialize(items)
|
11
|
-
@items = items
|
11
|
+
@items = items
|
12
12
|
@cur_index = 0
|
13
13
|
end
|
14
14
|
|
@@ -17,7 +17,9 @@ module Twitterscraper
|
|
17
17
|
reload
|
18
18
|
end
|
19
19
|
@cur_index += 1
|
20
|
-
@items[@cur_index - 1]
|
20
|
+
item = @items[@cur_index - 1]
|
21
|
+
Twitterscraper.logger.info("Using proxy #{item}")
|
22
|
+
item
|
21
23
|
end
|
22
24
|
|
23
25
|
def size
|
@@ -27,7 +29,7 @@ module Twitterscraper
|
|
27
29
|
private
|
28
30
|
|
29
31
|
def reload
|
30
|
-
@items = Proxy.get_proxies
|
32
|
+
@items = Proxy.get_proxies
|
31
33
|
@cur_index = 0
|
32
34
|
Twitterscraper.logger.debug "Reload #{proxies.size} proxies"
|
33
35
|
end
|
@@ -50,7 +52,7 @@ module Twitterscraper
|
|
50
52
|
end
|
51
53
|
|
52
54
|
Twitterscraper.logger.debug "Fetch #{proxies.size} proxies"
|
53
|
-
Result.new(proxies)
|
55
|
+
Result.new(proxies.shuffle)
|
54
56
|
rescue => e
|
55
57
|
if (retries -= 1) > 0
|
56
58
|
retry
|
data/lib/twitterscraper/query.rb
CHANGED
@@ -1,7 +1,9 @@
|
|
1
|
+
require 'resolv-replace'
|
1
2
|
require 'net/http'
|
2
3
|
require 'nokogiri'
|
3
4
|
require 'date'
|
4
5
|
require 'json'
|
6
|
+
require 'parallel'
|
5
7
|
|
6
8
|
module Twitterscraper
|
7
9
|
module Query
|
@@ -14,7 +16,6 @@ module Twitterscraper
|
|
14
16
|
'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
|
15
17
|
'Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre',
|
16
18
|
]
|
17
|
-
USER_AGENT = USER_AGENT_LIST.sample
|
18
19
|
|
19
20
|
INIT_URL = 'https://twitter.com/search?f=tweets&vertical=default&q=__QUERY__&l=__LANG__'
|
20
21
|
RELOAD_URL = 'https://twitter.com/i/search/timeline?f=tweets&vertical=' +
|
@@ -59,15 +60,15 @@ module Twitterscraper
|
|
59
60
|
else
|
60
61
|
json_resp = JSON.parse(text)
|
61
62
|
items_html = json_resp['items_html'] || ''
|
62
|
-
logger.
|
63
|
+
logger.warn json_resp['message'] if json_resp['message'] # Sorry, you are rate limited.
|
63
64
|
end
|
64
65
|
|
65
66
|
[items_html, json_resp]
|
66
67
|
end
|
67
68
|
|
68
69
|
def query_single_page(query, lang, pos, from_user = false, headers: [], proxies: [])
|
69
|
-
query = query.gsub(' ', '%20').gsub('#', '%23').gsub(':', '%3A').gsub('&', '%26')
|
70
70
|
logger.info("Querying #{query}")
|
71
|
+
query = query.gsub(' ', '%20').gsub('#', '%23').gsub(':', '%3A').gsub('&', '%26')
|
71
72
|
|
72
73
|
url = build_query_url(query, lang, pos, from_user)
|
73
74
|
logger.debug("Scraping tweets from #{url}")
|
@@ -99,28 +100,31 @@ module Twitterscraper
|
|
99
100
|
raise ':start_date must occur before :end_date.'
|
100
101
|
end
|
101
102
|
|
102
|
-
# TODO parallel
|
103
|
-
|
104
|
-
pos = nil
|
105
|
-
all_tweets = []
|
106
|
-
|
107
103
|
proxies = Twitterscraper::Proxy.get_proxies
|
108
104
|
|
109
|
-
|
110
|
-
|
105
|
+
date_range = start_date.upto(end_date - 1)
|
106
|
+
queries = date_range.map { |date| query + " since:#{date} until:#{date + 1}" }
|
107
|
+
threads = queries.size if threads > queries.size
|
108
|
+
logger.info("Threads #{threads}")
|
109
|
+
|
110
|
+
all_tweets = []
|
111
|
+
mutex = Mutex.new
|
111
112
|
|
112
|
-
|
113
|
-
|
113
|
+
Parallel.each(queries, in_threads: threads) do |query|
|
114
|
+
headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
|
115
|
+
logger.info("Headers #{headers}")
|
114
116
|
|
115
|
-
|
117
|
+
pos = nil
|
116
118
|
|
117
119
|
while true
|
118
|
-
new_tweets, new_pos = query_single_page(
|
120
|
+
new_tweets, new_pos = query_single_page(query, lang, pos, headers: headers, proxies: proxies)
|
119
121
|
unless new_tweets.empty?
|
120
|
-
|
121
|
-
|
122
|
+
mutex.synchronize {
|
123
|
+
all_tweets.concat(new_tweets)
|
124
|
+
all_tweets.uniq! { |t| t.tweet_id }
|
125
|
+
}
|
122
126
|
end
|
123
|
-
logger.info("Got #{new_tweets.size} tweets (total #{all_tweets.size})")
|
127
|
+
logger.info("Got #{new_tweets.size} tweets (total #{all_tweets.size}) worker=#{Parallel.worker_number}")
|
124
128
|
|
125
129
|
break unless new_pos
|
126
130
|
break if all_tweets.size >= limit
|
@@ -130,11 +134,11 @@ module Twitterscraper
|
|
130
134
|
|
131
135
|
if all_tweets.size >= limit
|
132
136
|
logger.info("Reached limit #{all_tweets.size}")
|
133
|
-
|
137
|
+
raise Parallel::Break
|
134
138
|
end
|
135
139
|
end
|
136
140
|
|
137
|
-
all_tweets
|
141
|
+
all_tweets.sort_by { |tweet| -tweet.created_at.to_i }
|
138
142
|
end
|
139
143
|
end
|
140
144
|
end
|
data/lib/version.rb
CHANGED
data/twitterscraper-ruby.gemspec
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: twitterscraper-ruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.0
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- ts-3156
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-07-
|
11
|
+
date: 2020-07-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -24,6 +24,20 @@ dependencies:
|
|
24
24
|
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: parallel
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
27
41
|
description: A gem to scrape Tweets
|
28
42
|
email:
|
29
43
|
- ts_3156@yahoo.co.jp
|