twitterscraper-ruby 0.8.0 → 0.13.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -1
- data/.rspec +2 -0
- data/Gemfile +1 -0
- data/Gemfile.lock +16 -1
- data/README.md +56 -22
- data/bin/twitterscraper +1 -1
- data/lib/twitterscraper.rb +2 -0
- data/lib/twitterscraper/cache.rb +69 -0
- data/lib/twitterscraper/cli.rb +27 -6
- data/lib/twitterscraper/client.rb +13 -0
- data/lib/twitterscraper/proxy.rb +5 -4
- data/lib/twitterscraper/query.rb +64 -27
- data/lib/twitterscraper/template.rb +48 -0
- data/lib/twitterscraper/tweet.rb +27 -6
- data/lib/version.rb +1 -1
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bafdfd47b386ef7f717dc5846102c8a5153f4660e61d3559f6834cdca340c19c
|
4
|
+
data.tar.gz: fb5564629d89ae83c916d868e9fd401fdca1b423fbeb2d6945b0831c0d8ecf11
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5e9c819b318a908c73f56de0a638c7c819cc1e31c867812ec9ffa3e23362318db3dfc1fe5ffde53c4769bffdf1f62efdb4701bf9b1efe874625cdb6ce21ef1bc
|
7
|
+
data.tar.gz: aa75f3a328f6c2c278738962e7d6e9ea747841343362e8a0f226fd76b316b6ab05e63c93e495ae39009fd3f0a1c4eb0657bc66e1b22416e6a695cc34b4059643
|
data/.gitignore
CHANGED
data/.rspec
ADDED
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,19 +1,33 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
twitterscraper-ruby (0.8.0)
|
4
|
+
twitterscraper-ruby (0.13.0)
|
5
5
|
nokogiri
|
6
6
|
parallel
|
7
7
|
|
8
8
|
GEM
|
9
9
|
remote: https://rubygems.org/
|
10
10
|
specs:
|
11
|
+
diff-lcs (1.4.4)
|
11
12
|
mini_portile2 (2.4.0)
|
12
13
|
minitest (5.14.1)
|
13
14
|
nokogiri (1.10.10)
|
14
15
|
mini_portile2 (~> 2.4.0)
|
15
16
|
parallel (1.19.2)
|
16
17
|
rake (12.3.3)
|
18
|
+
rspec (3.9.0)
|
19
|
+
rspec-core (~> 3.9.0)
|
20
|
+
rspec-expectations (~> 3.9.0)
|
21
|
+
rspec-mocks (~> 3.9.0)
|
22
|
+
rspec-core (3.9.2)
|
23
|
+
rspec-support (~> 3.9.3)
|
24
|
+
rspec-expectations (3.9.2)
|
25
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
26
|
+
rspec-support (~> 3.9.0)
|
27
|
+
rspec-mocks (3.9.1)
|
28
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
29
|
+
rspec-support (~> 3.9.0)
|
30
|
+
rspec-support (3.9.3)
|
17
31
|
|
18
32
|
PLATFORMS
|
19
33
|
ruby
|
@@ -21,6 +35,7 @@ PLATFORMS
|
|
21
35
|
DEPENDENCIES
|
22
36
|
minitest (~> 5.0)
|
23
37
|
rake (~> 12.0)
|
38
|
+
rspec
|
24
39
|
twitterscraper-ruby!
|
25
40
|
|
26
41
|
BUNDLED WITH
|
data/README.md
CHANGED
@@ -33,7 +33,7 @@ Command-line interface:
|
|
33
33
|
|
34
34
|
```shell script
|
35
35
|
$ twitterscraper --query KEYWORD --start_date 2020-06-01 --end_date 2020-06-30 --lang ja \
|
36
|
-
--limit 100 --threads 10 --proxy --output output.json
|
36
|
+
--limit 100 --threads 10 --proxy --cache --output output.json
|
37
37
|
```
|
38
38
|
|
39
39
|
From Within Ruby:
|
@@ -56,12 +56,60 @@ tweets = client.query_tweets(KEYWORD, options)
|
|
56
56
|
tweets.each do |tweet|
|
57
57
|
puts tweet.tweet_id
|
58
58
|
puts tweet.text
|
59
|
-
puts tweet.created_at
|
60
59
|
puts tweet.tweet_url
|
60
|
+
puts tweet.created_at
|
61
|
+
|
62
|
+
hash = tweet.attrs
|
63
|
+
puts hash.keys
|
61
64
|
end
|
62
65
|
```
|
63
66
|
|
64
67
|
|
68
|
+
## Attributes
|
69
|
+
|
70
|
+
### Tweet
|
71
|
+
|
72
|
+
- screen_name
|
73
|
+
- name
|
74
|
+
- user_id
|
75
|
+
- tweet_id
|
76
|
+
- text
|
77
|
+
- links
|
78
|
+
- hashtags
|
79
|
+
- image_urls
|
80
|
+
- video_url
|
81
|
+
- has_media
|
82
|
+
- likes
|
83
|
+
- retweets
|
84
|
+
- replies
|
85
|
+
- is_replied
|
86
|
+
- is_reply_to
|
87
|
+
- parent_tweet_id
|
88
|
+
- reply_to_users
|
89
|
+
- tweet_url
|
90
|
+
- created_at
|
91
|
+
|
92
|
+
|
93
|
+
## Search operators
|
94
|
+
|
95
|
+
| Operator | Finds Tweets... |
|
96
|
+
| ------------- | ------------- |
|
97
|
+
| watching now | containing both "watching" and "now". This is the default operator. |
|
98
|
+
| "happy hour" | containing the exact phrase "happy hour". |
|
99
|
+
| love OR hate | containing either "love" or "hate" (or both). |
|
100
|
+
| beer -root | containing "beer" but not "root". |
|
101
|
+
| #haiku | containing the hashtag "haiku". |
|
102
|
+
| from:interior | sent from Twitter account "interior". |
|
103
|
+
| to:NASA | a Tweet authored in reply to Twitter account "NASA". |
|
104
|
+
| @NASA | mentioning Twitter account "NASA". |
|
105
|
+
| puppy filter:media | containing "puppy" and an image or video. |
|
106
|
+
| puppy -filter:retweets | containing "puppy", filtering out retweets |
|
107
|
+
| superhero since:2015-12-21 | containing "superhero" and sent since date "2015-12-21" (year-month-day). |
|
108
|
+
| puppy until:2015-12-21 | containing "puppy" and sent before the date "2015-12-21". |
|
109
|
+
|
110
|
+
Search operators documentation is in [Standard search operators](https://developer.twitter.com/en/docs/tweets/rules-and-filtering/overview/standard-operators).
|
111
|
+
|
112
|
+
|
65
113
|
## Examples
|
66
114
|
|
67
115
|
```shell script
|
@@ -79,40 +127,26 @@ $ cat tweets.json | jq . | less
|
|
79
127
|
"tweet_url": "https://twitter.com/screenname/status/1282659891992000000",
|
80
128
|
"created_at": "2020-07-13 12:00:00 +0000",
|
81
129
|
"text": "Thanks Twitter!"
|
82
|
-
}
|
83
|
-
...
|
130
|
+
}
|
84
131
|
]
|
85
132
|
```
|
86
133
|
|
87
|
-
## Attributes
|
88
|
-
|
89
|
-
### Tweet
|
90
|
-
|
91
|
-
- tweet_id
|
92
|
-
- text
|
93
|
-
- user_id
|
94
|
-
- screen_name
|
95
|
-
- name
|
96
|
-
- links
|
97
|
-
- hashtags
|
98
|
-
- image_urls
|
99
|
-
- tweet_url
|
100
|
-
- created_at
|
101
|
-
|
102
|
-
|
103
134
|
## CLI Options
|
104
135
|
|
105
136
|
| Option | Description | Default |
|
106
137
|
| ------------- | ------------- | ------------- |
|
107
138
|
| `-h`, `--help` | This option displays a summary of twitterscraper. | |
|
108
139
|
| `--query` | Specify a keyword used during the search. | |
|
109
|
-
| `--start_date` |
|
110
|
-
| `--end_date` |
|
140
|
+
| `--start_date` | Used as "since:yyyy-mm-dd" for your query. This means "since the date". | |
|
141
|
+
| `--end_date` | Used as "until:yyyy-mm-dd" for your query. This means "before the date". | |
|
111
142
|
| `--lang` | Retrieve tweets written in a specific language. | |
|
112
143
|
| `--limit` | Stop scraping when *at least* the number of tweets indicated with --limit is scraped. | 100 |
|
113
144
|
| `--threads` | Set the number of threads twitterscraper-ruby should initiate while scraping for your query. | 2 |
|
114
145
|
| `--proxy` | Scrape https://twitter.com/search via proxies. | false |
|
146
|
+
| `--cache` | Enable caching. | false |
|
147
|
+
| `--format` | The format of the output. | json |
|
115
148
|
| `--output` | The name of the output file. | tweets.json |
|
149
|
+
| `--verbose` | Print debug messages. | false |
|
116
150
|
|
117
151
|
|
118
152
|
## Contributing
|
data/bin/twitterscraper
CHANGED
data/lib/twitterscraper.rb
CHANGED
@@ -2,9 +2,11 @@ require 'twitterscraper/logger'
|
|
2
2
|
require 'twitterscraper/proxy'
|
3
3
|
require 'twitterscraper/http'
|
4
4
|
require 'twitterscraper/lang'
|
5
|
+
require 'twitterscraper/cache'
|
5
6
|
require 'twitterscraper/query'
|
6
7
|
require 'twitterscraper/client'
|
7
8
|
require 'twitterscraper/tweet'
|
9
|
+
require 'twitterscraper/template'
|
8
10
|
require 'version'
|
9
11
|
|
10
12
|
module Twitterscraper
|
@@ -0,0 +1,69 @@
|
|
1
|
+
require 'base64'
|
2
|
+
require 'digest/md5'
|
3
|
+
|
4
|
+
module Twitterscraper
|
5
|
+
class Cache
|
6
|
+
def initialize()
|
7
|
+
@ttl = 3600 # 1 hour
|
8
|
+
@dir = 'cache'
|
9
|
+
Dir.mkdir(@dir) unless File.exist?(@dir)
|
10
|
+
end
|
11
|
+
|
12
|
+
def read(key)
|
13
|
+
key = cache_key(key)
|
14
|
+
file = File.join(@dir, key)
|
15
|
+
entry = Entry.from_json(File.read(file))
|
16
|
+
entry.value if entry.time > Time.now - @ttl
|
17
|
+
rescue Errno::ENOENT => e
|
18
|
+
nil
|
19
|
+
end
|
20
|
+
|
21
|
+
def write(key, value)
|
22
|
+
key = cache_key(key)
|
23
|
+
entry = Entry.new(key, value, Time.now)
|
24
|
+
file = File.join(@dir, key)
|
25
|
+
File.write(file, entry.to_json)
|
26
|
+
end
|
27
|
+
|
28
|
+
def fetch(key, &block)
|
29
|
+
if (value = read(key))
|
30
|
+
value
|
31
|
+
else
|
32
|
+
yield.tap { |v| write(key, v) }
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
def cache_key(key)
|
37
|
+
value = key.gsub(':', '%3A').gsub('/', '%2F').gsub('?', '%3F').gsub('=', '%3D').gsub('&', '%26')
|
38
|
+
value = Digest::MD5.hexdigest(value) if value.length >= 100
|
39
|
+
value
|
40
|
+
end
|
41
|
+
|
42
|
+
class Entry < Hash
|
43
|
+
attr_reader :key, :value, :time
|
44
|
+
|
45
|
+
def initialize(key, value, time)
|
46
|
+
@key = key
|
47
|
+
@value = value
|
48
|
+
@time = time
|
49
|
+
end
|
50
|
+
|
51
|
+
def attrs
|
52
|
+
{key: @key, value: @value, time: @time}
|
53
|
+
end
|
54
|
+
|
55
|
+
def to_json
|
56
|
+
hash = attrs
|
57
|
+
hash[:value] = Base64.encode64(hash[:value])
|
58
|
+
hash.to_json
|
59
|
+
end
|
60
|
+
|
61
|
+
class << self
|
62
|
+
def from_json(text)
|
63
|
+
json = JSON.parse(text)
|
64
|
+
new(json['key'], Base64.decode64(json['value']), Time.parse(json['time']))
|
65
|
+
end
|
66
|
+
end
|
67
|
+
end
|
68
|
+
end
|
69
|
+
end
|
data/lib/twitterscraper/cli.rb
CHANGED
@@ -20,12 +20,24 @@ module Twitterscraper
|
|
20
20
|
end_date: options['end_date'],
|
21
21
|
lang: options['lang'],
|
22
22
|
limit: options['limit'],
|
23
|
+
daily_limit: options['daily_limit'],
|
23
24
|
threads: options['threads'],
|
24
|
-
proxy: options['proxy']
|
25
25
|
}
|
26
|
-
client = Twitterscraper::Client.new
|
26
|
+
client = Twitterscraper::Client.new(cache: options['cache'], proxy: options['proxy'])
|
27
27
|
tweets = client.query_tweets(options['query'], query_options)
|
28
|
-
|
28
|
+
export(tweets) unless tweets.empty?
|
29
|
+
end
|
30
|
+
|
31
|
+
def export(tweets)
|
32
|
+
write_json = lambda { File.write(options['output'], generate_json(tweets)) }
|
33
|
+
|
34
|
+
if options['format'] == 'json'
|
35
|
+
write_json.call
|
36
|
+
elsif options['format'] == 'html'
|
37
|
+
File.write('tweets.html', Template.tweets_embedded_html(tweets))
|
38
|
+
else
|
39
|
+
write_json.call
|
40
|
+
end
|
29
41
|
end
|
30
42
|
|
31
43
|
def generate_json(tweets)
|
@@ -51,17 +63,26 @@ module Twitterscraper
|
|
51
63
|
'end_date:',
|
52
64
|
'lang:',
|
53
65
|
'limit:',
|
66
|
+
'daily_limit:',
|
54
67
|
'threads:',
|
55
68
|
'output:',
|
56
|
-
'
|
69
|
+
'format:',
|
70
|
+
'cache:',
|
71
|
+
'proxy:',
|
57
72
|
'pretty',
|
58
73
|
'verbose',
|
59
74
|
)
|
60
75
|
|
76
|
+
options['start_date'] = Query::OLDEST_DATE if options['start_date'] == 'oldest'
|
61
77
|
options['lang'] ||= ''
|
62
78
|
options['limit'] = (options['limit'] || 100).to_i
|
79
|
+
options['daily_limit'] = options['daily_limit'].to_i if options['daily_limit']
|
63
80
|
options['threads'] = (options['threads'] || 2).to_i
|
64
|
-
options['
|
81
|
+
options['format'] ||= 'json'
|
82
|
+
options['output'] ||= "tweets.#{options['format']}"
|
83
|
+
|
84
|
+
options['cache'] = options['cache'] != 'false'
|
85
|
+
options['proxy'] = options['proxy'] != 'false'
|
65
86
|
|
66
87
|
options
|
67
88
|
end
|
@@ -86,7 +107,7 @@ module Twitterscraper
|
|
86
107
|
end
|
87
108
|
|
88
109
|
def print_version
|
89
|
-
puts "twitterscraper-#{
|
110
|
+
puts "twitterscraper-#{VERSION}"
|
90
111
|
end
|
91
112
|
end
|
92
113
|
end
|
data/lib/twitterscraper/proxy.rb
CHANGED
@@ -17,15 +17,17 @@ module Twitterscraper
|
|
17
17
|
reload
|
18
18
|
end
|
19
19
|
@cur_index += 1
|
20
|
-
|
21
|
-
Twitterscraper.logger.info("Using proxy #{item}")
|
22
|
-
item
|
20
|
+
@items[@cur_index - 1]
|
23
21
|
end
|
24
22
|
|
25
23
|
def size
|
26
24
|
@items.size
|
27
25
|
end
|
28
26
|
|
27
|
+
def empty?
|
28
|
+
@items.empty?
|
29
|
+
end
|
30
|
+
|
29
31
|
private
|
30
32
|
|
31
33
|
def reload
|
@@ -51,7 +53,6 @@ module Twitterscraper
|
|
51
53
|
proxies << ip + ':' + port
|
52
54
|
end
|
53
55
|
|
54
|
-
Twitterscraper.logger.debug "Fetch #{proxies.size} proxies"
|
55
56
|
proxies.shuffle
|
56
57
|
rescue => e
|
57
58
|
if (retries -= 1) > 0
|
data/lib/twitterscraper/query.rb
CHANGED
@@ -44,14 +44,18 @@ module Twitterscraper
|
|
44
44
|
|
45
45
|
def get_single_page(url, headers, proxies, timeout = 6, retries = 30)
|
46
46
|
return nil if stop_requested?
|
47
|
-
|
47
|
+
unless proxies.empty?
|
48
|
+
proxy = proxies.sample
|
49
|
+
logger.info("Using proxy #{proxy}")
|
50
|
+
end
|
51
|
+
Http.get(url, headers, proxy, timeout)
|
48
52
|
rescue => e
|
49
53
|
logger.debug "query_single_page: #{e.inspect}"
|
50
54
|
if (retries -= 1) > 0
|
51
|
-
logger.info
|
55
|
+
logger.info "Retrying... (Attempts left: #{retries - 1})"
|
52
56
|
retry
|
53
57
|
else
|
54
|
-
raise
|
58
|
+
raise Error.new("#{e.inspect} url=#{url}")
|
55
59
|
end
|
56
60
|
end
|
57
61
|
|
@@ -71,14 +75,27 @@ module Twitterscraper
|
|
71
75
|
end
|
72
76
|
|
73
77
|
def query_single_page(query, lang, pos, from_user = false, headers: [], proxies: [])
|
74
|
-
logger.info
|
78
|
+
logger.info "Querying #{query}"
|
75
79
|
query = ERB::Util.url_encode(query)
|
76
80
|
|
77
81
|
url = build_query_url(query, lang, pos, from_user)
|
78
|
-
|
82
|
+
http_request = lambda do
|
83
|
+
logger.debug "Scraping tweets from #{url}"
|
84
|
+
get_single_page(url, headers, proxies)
|
85
|
+
end
|
79
86
|
|
80
|
-
|
81
|
-
|
87
|
+
if cache_enabled?
|
88
|
+
client = Cache.new
|
89
|
+
if (response = client.read(url))
|
90
|
+
logger.debug 'Fetching tweets from cache'
|
91
|
+
else
|
92
|
+
response = http_request.call
|
93
|
+
client.write(url, response) unless stop_requested?
|
94
|
+
end
|
95
|
+
else
|
96
|
+
response = http_request.call
|
97
|
+
end
|
98
|
+
return [], nil if response.nil? || response.empty?
|
82
99
|
|
83
100
|
html, json_resp = parse_single_page(response, pos.nil?)
|
84
101
|
|
@@ -97,35 +114,36 @@ module Twitterscraper
|
|
97
114
|
end
|
98
115
|
end
|
99
116
|
|
100
|
-
OLDEST_DATE = Date.parse('2006-
|
117
|
+
OLDEST_DATE = Date.parse('2006-03-21')
|
101
118
|
|
102
|
-
def validate_options!(
|
119
|
+
def validate_options!(queries, start_date:, end_date:, lang:, limit:, threads:)
|
120
|
+
query = queries[0]
|
103
121
|
if query.nil? || query == ''
|
104
|
-
raise 'Please specify a search query.'
|
122
|
+
raise Error.new('Please specify a search query.')
|
105
123
|
end
|
106
124
|
|
107
125
|
if ERB::Util.url_encode(query).length >= 500
|
108
|
-
raise ':query must be a UTF-8, URL-encoded search query of 500 characters maximum, including operators.'
|
126
|
+
raise Error.new(':query must be a UTF-8, URL-encoded search query of 500 characters maximum, including operators.')
|
109
127
|
end
|
110
128
|
|
111
129
|
if start_date && end_date
|
112
130
|
if start_date == end_date
|
113
|
-
raise 'Please specify different values for :start_date and :end_date.'
|
131
|
+
raise Error.new('Please specify different values for :start_date and :end_date.')
|
114
132
|
elsif start_date > end_date
|
115
|
-
raise ':start_date must occur before :end_date.'
|
133
|
+
raise Error.new(':start_date must occur before :end_date.')
|
116
134
|
end
|
117
135
|
end
|
118
136
|
|
119
137
|
if start_date
|
120
138
|
if start_date < OLDEST_DATE
|
121
|
-
raise ":start_date must be greater than or equal to #{OLDEST_DATE}"
|
139
|
+
raise Error.new(":start_date must be greater than or equal to #{OLDEST_DATE}")
|
122
140
|
end
|
123
141
|
end
|
124
142
|
|
125
143
|
if end_date
|
126
144
|
today = Date.today
|
127
145
|
if end_date > Date.today
|
128
|
-
raise ":end_date must be less than or equal to today(#{today})"
|
146
|
+
raise Error.new(":end_date must be less than or equal to today(#{today})")
|
129
147
|
end
|
130
148
|
end
|
131
149
|
end
|
@@ -143,27 +161,32 @@ module Twitterscraper
|
|
143
161
|
end
|
144
162
|
end
|
145
163
|
|
146
|
-
def main_loop(query, lang, limit, headers, proxies)
|
164
|
+
def main_loop(query, lang, limit, daily_limit, headers, proxies)
|
147
165
|
pos = nil
|
166
|
+
daily_tweets = []
|
148
167
|
|
149
168
|
while true
|
150
169
|
new_tweets, new_pos = query_single_page(query, lang, pos, headers: headers, proxies: proxies)
|
151
170
|
unless new_tweets.empty?
|
171
|
+
daily_tweets.concat(new_tweets)
|
172
|
+
daily_tweets.uniq! { |t| t.tweet_id }
|
173
|
+
|
152
174
|
@mutex.synchronize {
|
153
175
|
@all_tweets.concat(new_tweets)
|
154
176
|
@all_tweets.uniq! { |t| t.tweet_id }
|
155
177
|
}
|
156
178
|
end
|
157
|
-
logger.info
|
179
|
+
logger.info "Got #{new_tweets.size} tweets (total #{@all_tweets.size})"
|
158
180
|
|
159
181
|
break unless new_pos
|
182
|
+
break if daily_limit && daily_tweets.size >= daily_limit
|
160
183
|
break if @all_tweets.size >= limit
|
161
184
|
|
162
185
|
pos = new_pos
|
163
186
|
end
|
164
187
|
|
165
|
-
if @all_tweets.size >= limit
|
166
|
-
logger.
|
188
|
+
if !@stop_requested && @all_tweets.size >= limit
|
189
|
+
logger.warn "The limit you specified has been reached limit=#{limit} tweets=#{@all_tweets.size}"
|
167
190
|
@stop_requested = true
|
168
191
|
end
|
169
192
|
end
|
@@ -172,32 +195,46 @@ module Twitterscraper
|
|
172
195
|
@stop_requested
|
173
196
|
end
|
174
197
|
|
175
|
-
def query_tweets(query, start_date: nil, end_date: nil, lang: '', limit: 100,
|
198
|
+
def query_tweets(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, threads: 2)
|
176
199
|
start_date = Date.parse(start_date) if start_date && start_date.is_a?(String)
|
177
200
|
end_date = Date.parse(end_date) if end_date && end_date.is_a?(String)
|
178
201
|
queries = build_queries(query, start_date, end_date)
|
179
|
-
|
180
|
-
|
202
|
+
if threads > queries.size
|
203
|
+
logger.warn 'The maximum number of :threads is the number of dates between :start_date and :end_date.'
|
204
|
+
threads = queries.size
|
205
|
+
end
|
206
|
+
if proxy_enabled?
|
207
|
+
proxies = Proxy::Pool.new
|
208
|
+
logger.debug "Fetch #{proxies.size} proxies"
|
209
|
+
else
|
210
|
+
proxies = []
|
211
|
+
logger.debug 'Proxy disabled'
|
212
|
+
end
|
213
|
+
logger.debug "Cache #{cache_enabled? ? 'enabled' : 'disabled'}"
|
181
214
|
|
182
|
-
validate_options!(queries[0], start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads, proxy: proxy)
|
183
215
|
|
184
|
-
|
216
|
+
validate_options!(queries, start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads)
|
217
|
+
|
218
|
+
logger.info "The number of threads #{threads}"
|
185
219
|
|
186
220
|
headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
|
187
|
-
logger.info
|
221
|
+
logger.info "Headers #{headers}"
|
188
222
|
|
189
223
|
@all_tweets = []
|
190
224
|
@mutex = Mutex.new
|
191
225
|
@stop_requested = false
|
192
226
|
|
193
227
|
if threads > 1
|
228
|
+
Thread.abort_on_exception = true
|
229
|
+
logger.debug "Set 'Thread.abort_on_exception' to true"
|
230
|
+
|
194
231
|
Parallel.each(queries, in_threads: threads) do |query|
|
195
|
-
main_loop(query, lang, limit, headers, proxies)
|
232
|
+
main_loop(query, lang, limit, daily_limit, headers, proxies)
|
196
233
|
raise Parallel::Break if stop_requested?
|
197
234
|
end
|
198
235
|
else
|
199
236
|
queries.each do |query|
|
200
|
-
main_loop(query, lang, limit, headers, proxies)
|
237
|
+
main_loop(query, lang, limit, daily_limit, headers, proxies)
|
201
238
|
break if stop_requested?
|
202
239
|
end
|
203
240
|
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
module Twitterscraper
|
2
|
+
module Template
|
3
|
+
module_function
|
4
|
+
|
5
|
+
def tweets_embedded_html(tweets)
|
6
|
+
tweets_html = tweets.map { |t| EMBED_TWEET_HTML.sub('__TWEET_URL__', t.tweet_url) }
|
7
|
+
EMBED_TWEETS_HTML.sub('__TWEETS__', tweets_html.join)
|
8
|
+
end
|
9
|
+
|
10
|
+
EMBED_TWEET_HTML = <<~'HTML'
|
11
|
+
<blockquote class="twitter-tweet">
|
12
|
+
<a href="__TWEET_URL__"></a>
|
13
|
+
</blockquote>
|
14
|
+
HTML
|
15
|
+
|
16
|
+
EMBED_TWEETS_HTML = <<~'HTML'
|
17
|
+
<html>
|
18
|
+
<head>
|
19
|
+
<style type=text/css>
|
20
|
+
.twitter-tweet {
|
21
|
+
margin: 30px auto 0 auto !important;
|
22
|
+
}
|
23
|
+
</style>
|
24
|
+
<script>
|
25
|
+
window.twttr = (function(d, s, id) {
|
26
|
+
var js, fjs = d.getElementsByTagName(s)[0], t = window.twttr || {};
|
27
|
+
if (d.getElementById(id)) return t;
|
28
|
+
js = d.createElement(s);
|
29
|
+
js.id = id;
|
30
|
+
js.src = "https://platform.twitter.com/widgets.js";
|
31
|
+
fjs.parentNode.insertBefore(js, fjs);
|
32
|
+
|
33
|
+
t._e = [];
|
34
|
+
t.ready = function(f) {
|
35
|
+
t._e.push(f);
|
36
|
+
};
|
37
|
+
|
38
|
+
return t;
|
39
|
+
}(document, "script", "twitter-wjs"));
|
40
|
+
</script>
|
41
|
+
</head>
|
42
|
+
<body>
|
43
|
+
__TWEETS__
|
44
|
+
</body>
|
45
|
+
</html>
|
46
|
+
HTML
|
47
|
+
end
|
48
|
+
end
|
data/lib/twitterscraper/tweet.rb
CHANGED
@@ -21,6 +21,7 @@ module Twitterscraper
|
|
21
21
|
:parent_tweet_id,
|
22
22
|
:reply_to_users,
|
23
23
|
:tweet_url,
|
24
|
+
:timestamp,
|
24
25
|
:created_at,
|
25
26
|
]
|
26
27
|
attr_reader *KEYS
|
@@ -31,13 +32,25 @@ module Twitterscraper
|
|
31
32
|
end
|
32
33
|
end
|
33
34
|
|
34
|
-
def
|
35
|
+
def attrs
|
35
36
|
KEYS.map do |key|
|
36
37
|
[key, send(key)]
|
37
|
-
end.to_h
|
38
|
+
end.to_h
|
39
|
+
end
|
40
|
+
|
41
|
+
def to_json(options = {})
|
42
|
+
attrs.to_json
|
38
43
|
end
|
39
44
|
|
40
45
|
class << self
|
46
|
+
def from_json(text)
|
47
|
+
json = JSON.parse(text)
|
48
|
+
json.map do |tweet|
|
49
|
+
tweet['created_at'] = Time.parse(tweet['created_at'])
|
50
|
+
new(tweet)
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
41
54
|
def from_html(text)
|
42
55
|
html = Nokogiri::HTML(text)
|
43
56
|
from_tweets_html(html.xpath("//li[@class[contains(., 'js-stream-item')]]/div[@class[contains(., 'js-stream-tweet')]]"))
|
@@ -46,12 +59,19 @@ module Twitterscraper
|
|
46
59
|
def from_tweets_html(html)
|
47
60
|
html.map do |tweet|
|
48
61
|
from_tweet_html(tweet)
|
49
|
-
end
|
62
|
+
end.compact
|
50
63
|
end
|
51
64
|
|
52
65
|
def from_tweet_html(html)
|
66
|
+
screen_name = html.attr('data-screen-name')
|
67
|
+
tweet_id = html.attr('data-tweet-id')&.to_i
|
68
|
+
|
69
|
+
unless html.to_s.include?('js-tweet-text-container')
|
70
|
+
Twitterscraper.logger.warn "html doesn't include div.js-tweet-text-container url=https://twitter.com/#{screen_name}/status/#{tweet_id}"
|
71
|
+
return nil
|
72
|
+
end
|
73
|
+
|
53
74
|
inner_html = Nokogiri::HTML(html.inner_html)
|
54
|
-
tweet_id = html.attr('data-tweet-id').to_i
|
55
75
|
text = inner_html.xpath("//div[@class[contains(., 'js-tweet-text-container')]]/p[@class[contains(., 'js-tweet-text')]]").first.text
|
56
76
|
links = inner_html.xpath("//a[@class[contains(., 'twitter-timeline-link')]]").map { |elem| elem.attr('data-expanded-url') }.select { |link| link && !link.include?('pic.twitter') }
|
57
77
|
image_urls = inner_html.xpath("//div[@class[contains(., 'AdaptiveMedia-photoContainer')]]").map { |elem| elem.attr('data-image-url') }
|
@@ -74,9 +94,9 @@ module Twitterscraper
|
|
74
94
|
reply_to_users = inner_html.xpath("//div[@class[contains(., 'ReplyingToContextBelowAuthor')]]/a").map { |user| {screen_name: user.text.delete_prefix('@'), user_id: user.attr('data-user-id')} }
|
75
95
|
end
|
76
96
|
|
77
|
-
timestamp = inner_html.xpath("//span[@class[contains(., '
|
97
|
+
timestamp = inner_html.xpath("//span[@class[contains(., 'js-short-timestamp')]]").first.attr('data-time').to_i
|
78
98
|
new(
|
79
|
-
screen_name:
|
99
|
+
screen_name: screen_name,
|
80
100
|
name: html.attr('data-name'),
|
81
101
|
user_id: html.attr('data-user-id').to_i,
|
82
102
|
tweet_id: tweet_id,
|
@@ -94,6 +114,7 @@ module Twitterscraper
|
|
94
114
|
parent_tweet_id: parent_tweet_id,
|
95
115
|
reply_to_users: reply_to_users,
|
96
116
|
tweet_url: 'https://twitter.com' + html.attr('data-permalink-path'),
|
117
|
+
timestamp: timestamp,
|
97
118
|
created_at: Time.at(timestamp, in: '+00:00'),
|
98
119
|
)
|
99
120
|
end
|
data/lib/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: twitterscraper-ruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.0
|
4
|
+
version: 0.13.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- ts-3156
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-07-
|
11
|
+
date: 2020-07-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -48,6 +48,7 @@ extra_rdoc_files: []
|
|
48
48
|
files:
|
49
49
|
- ".gitignore"
|
50
50
|
- ".irbrc"
|
51
|
+
- ".rspec"
|
51
52
|
- ".ruby-version"
|
52
53
|
- ".travis.yml"
|
53
54
|
- CODE_OF_CONDUCT.md
|
@@ -61,6 +62,7 @@ files:
|
|
61
62
|
- bin/twitterscraper
|
62
63
|
- lib/twitterscraper-ruby.rb
|
63
64
|
- lib/twitterscraper.rb
|
65
|
+
- lib/twitterscraper/cache.rb
|
64
66
|
- lib/twitterscraper/cli.rb
|
65
67
|
- lib/twitterscraper/client.rb
|
66
68
|
- lib/twitterscraper/http.rb
|
@@ -68,6 +70,7 @@ files:
|
|
68
70
|
- lib/twitterscraper/logger.rb
|
69
71
|
- lib/twitterscraper/proxy.rb
|
70
72
|
- lib/twitterscraper/query.rb
|
73
|
+
- lib/twitterscraper/template.rb
|
71
74
|
- lib/twitterscraper/tweet.rb
|
72
75
|
- lib/version.rb
|
73
76
|
- twitterscraper-ruby.gemspec
|