twitterscraper-ruby 0.8.0 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -1
- data/.rspec +2 -0
- data/Gemfile +1 -0
- data/Gemfile.lock +16 -1
- data/README.md +56 -22
- data/bin/twitterscraper +1 -1
- data/lib/twitterscraper.rb +2 -0
- data/lib/twitterscraper/cache.rb +69 -0
- data/lib/twitterscraper/cli.rb +27 -6
- data/lib/twitterscraper/client.rb +13 -0
- data/lib/twitterscraper/proxy.rb +5 -4
- data/lib/twitterscraper/query.rb +64 -27
- data/lib/twitterscraper/template.rb +48 -0
- data/lib/twitterscraper/tweet.rb +27 -6
- data/lib/version.rb +1 -1
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: bafdfd47b386ef7f717dc5846102c8a5153f4660e61d3559f6834cdca340c19c
+  data.tar.gz: fb5564629d89ae83c916d868e9fd401fdca1b423fbeb2d6945b0831c0d8ecf11
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 5e9c819b318a908c73f56de0a638c7c819cc1e31c867812ec9ffa3e23362318db3dfc1fe5ffde53c4769bffdf1f62efdb4701bf9b1efe874625cdb6ce21ef1bc
+  data.tar.gz: aa75f3a328f6c2c278738962e7d6e9ea747841343362e8a0f226fd76b316b6ab05e63c93e495ae39009fd3f0a1c4eb0657bc66e1b22416e6a695cc34b4059643
data/.gitignore
CHANGED
data/.rspec
ADDED
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,19 +1,33 @@
 PATH
   remote: .
   specs:
-    twitterscraper-ruby (0.8.0)
+    twitterscraper-ruby (0.13.0)
       nokogiri
       parallel
 
 GEM
   remote: https://rubygems.org/
   specs:
+    diff-lcs (1.4.4)
     mini_portile2 (2.4.0)
     minitest (5.14.1)
     nokogiri (1.10.10)
       mini_portile2 (~> 2.4.0)
     parallel (1.19.2)
     rake (12.3.3)
+    rspec (3.9.0)
+      rspec-core (~> 3.9.0)
+      rspec-expectations (~> 3.9.0)
+      rspec-mocks (~> 3.9.0)
+    rspec-core (3.9.2)
+      rspec-support (~> 3.9.3)
+    rspec-expectations (3.9.2)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.9.0)
+    rspec-mocks (3.9.1)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.9.0)
+    rspec-support (3.9.3)
 
 PLATFORMS
   ruby
@@ -21,6 +35,7 @@ PLATFORMS
 DEPENDENCIES
   minitest (~> 5.0)
   rake (~> 12.0)
+  rspec
   twitterscraper-ruby!
 
 BUNDLED WITH
data/README.md
CHANGED
@@ -33,7 +33,7 @@ Command-line interface:
 
 ```shell script
 $ twitterscraper --query KEYWORD --start_date 2020-06-01 --end_date 2020-06-30 --lang ja \
-    --limit 100 --threads 10 --proxy --output output.json
+    --limit 100 --threads 10 --proxy --cache --output output.json
 ```
 
 From Within Ruby:
@@ -56,12 +56,60 @@ tweets = client.query_tweets(KEYWORD, options)
 tweets.each do |tweet|
   puts tweet.tweet_id
   puts tweet.text
-  puts tweet.created_at
   puts tweet.tweet_url
+  puts tweet.created_at
+
+  hash = tweet.attrs
+  puts hash.keys
 end
 ```
 
 
+## Attributes
+
+### Tweet
+
+- screen_name
+- name
+- user_id
+- tweet_id
+- text
+- links
+- hashtags
+- image_urls
+- video_url
+- has_media
+- likes
+- retweets
+- replies
+- is_replied
+- is_reply_to
+- parent_tweet_id
+- reply_to_users
+- tweet_url
+- created_at
+
+
+## Search operators
+
+| Operator | Finds Tweets... |
+| ------------- | ------------- |
+| watching now | containing both "watching" and "now". This is the default operator. |
+| "happy hour" | containing the exact phrase "happy hour". |
+| love OR hate | containing either "love" or "hate" (or both). |
+| beer -root | containing "beer" but not "root". |
+| #haiku | containing the hashtag "haiku". |
+| from:interior | sent from Twitter account "interior". |
+| to:NASA | a Tweet authored in reply to Twitter account "NASA". |
+| @NASA | mentioning Twitter account "NASA". |
+| puppy filter:media | containing "puppy" and an image or video. |
+| puppy -filter:retweets | containing "puppy", filtering out retweets. |
+| superhero since:2015-12-21 | containing "superhero" and sent since date "2015-12-21" (year-month-day). |
+| puppy until:2015-12-21 | containing "puppy" and sent before the date "2015-12-21". |
+
+Search operators documentation is in [Standard search operators](https://developer.twitter.com/en/docs/tweets/rules-and-filtering/overview/standard-operators).
+
+
 ## Examples
 
 ```shell script
@@ -79,40 +127,26 @@ $ cat tweets.json | jq . | less
     "tweet_url": "https://twitter.com/screenname/status/1282659891992000000",
     "created_at": "2020-07-13 12:00:00 +0000",
     "text": "Thanks Twitter!"
-  }
-  ...
+  }
 ]
 ```
 
-## Attributes
-
-### Tweet
-
-- tweet_id
-- text
-- user_id
-- screen_name
-- name
-- links
-- hashtags
-- image_urls
-- tweet_url
-- created_at
-
-
 ## CLI Options
 
 | Option | Description | Default |
 | ------------- | ------------- | ------------- |
 | `-h`, `--help` | This option displays a summary of twitterscraper. | |
 | `--query` | Specify a keyword used during the search. | |
-| `--start_date` |
-| `--end_date` |
+| `--start_date` | Used as "since:yyyy-mm-dd" for your query. This means "since the date". | |
+| `--end_date` | Used as "until:yyyy-mm-dd" for your query. This means "before the date". | |
 | `--lang` | Retrieve tweets written in a specific language. | |
 | `--limit` | Stop scraping when *at least* the number of tweets indicated with --limit is scraped. | 100 |
 | `--threads` | Set the number of threads twitterscraper-ruby should initiate while scraping for your query. | 2 |
 | `--proxy` | Scrape https://twitter.com/search via proxies. | false |
+| `--cache` | Enable caching. | false |
+| `--format` | The format of the output. | json |
 | `--output` | The name of the output file. | tweets.json |
+| `--verbose` | Print debug messages. | false |
 
 
 ## Contributing
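The search operators in the table added above compose directly into the query string passed to the Ruby API. A hedged example, assuming `client` is the `Twitterscraper::Client` instance shown earlier in this README:

```ruby
# filter:media and -filter:retweets come straight from the operator table.
tweets = client.query_tweets('puppy filter:media -filter:retweets', limit: 50)
tweets.each { |tweet| puts tweet.tweet_url }
```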
data/bin/twitterscraper
CHANGED
data/lib/twitterscraper.rb
CHANGED
@@ -2,9 +2,11 @@ require 'twitterscraper/logger'
 require 'twitterscraper/proxy'
 require 'twitterscraper/http'
 require 'twitterscraper/lang'
+require 'twitterscraper/cache'
 require 'twitterscraper/query'
 require 'twitterscraper/client'
 require 'twitterscraper/tweet'
+require 'twitterscraper/template'
 require 'version'
 
 module Twitterscraper
data/lib/twitterscraper/cache.rb
ADDED
@@ -0,0 +1,69 @@
+require 'base64'
+require 'digest/md5'
+
+module Twitterscraper
+  class Cache
+    def initialize()
+      @ttl = 3600 # 1 hour
+      @dir = 'cache'
+      Dir.mkdir(@dir) unless File.exist?(@dir)
+    end
+
+    def read(key)
+      key = cache_key(key)
+      file = File.join(@dir, key)
+      entry = Entry.from_json(File.read(file))
+      entry.value if entry.time > Time.now - @ttl
+    rescue Errno::ENOENT => e
+      nil
+    end
+
+    def write(key, value)
+      key = cache_key(key)
+      entry = Entry.new(key, value, Time.now)
+      file = File.join(@dir, key)
+      File.write(file, entry.to_json)
+    end
+
+    def fetch(key, &block)
+      if (value = read(key))
+        value
+      else
+        yield.tap { |v| write(key, v) }
+      end
+    end
+
+    def cache_key(key)
+      value = key.gsub(':', '%3A').gsub('/', '%2F').gsub('?', '%3F').gsub('=', '%3D').gsub('&', '%26')
+      value = Digest::MD5.hexdigest(value) if value.length >= 100
+      value
+    end
+
+    class Entry < Hash
+      attr_reader :key, :value, :time
+
+      def initialize(key, value, time)
+        @key = key
+        @value = value
+        @time = time
+      end
+
+      def attrs
+        {key: @key, value: @value, time: @time}
+      end
+
+      def to_json
+        hash = attrs
+        hash[:value] = Base64.encode64(hash[:value])
+        hash.to_json
+      end
+
+      class << self
+        def from_json(text)
+          json = JSON.parse(text)
+          new(json['key'], Base64.decode64(json['value']), Time.parse(json['time']))
+        end
+      end
+    end
+  end
+end
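For orientation, a minimal sketch of how this new `Cache` class is driven. In practice the key is a search URL (see the query.rb changes below); `response_body` and `http_get` are hypothetical stand-ins for the real response and request:

```ruby
cache = Twitterscraper::Cache.new   # creates a ./cache directory on first use

url = 'https://twitter.com/i/search/timeline?q=ruby'
cache.write(url, response_body)     # persists a JSON entry, value Base64-encoded
cache.read(url)                     # => response_body within the 1-hour TTL, nil after
cache.fetch(url) { http_get(url) }  # read-through: the block runs only on a cache miss
```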
data/lib/twitterscraper/cli.rb
CHANGED
@@ -20,12 +20,24 @@ module Twitterscraper
         end_date: options['end_date'],
         lang: options['lang'],
         limit: options['limit'],
+        daily_limit: options['daily_limit'],
         threads: options['threads'],
-        proxy: options['proxy']
       }
-      client = Twitterscraper::Client.new
+      client = Twitterscraper::Client.new(cache: options['cache'], proxy: options['proxy'])
       tweets = client.query_tweets(options['query'], query_options)
-
+      export(tweets) unless tweets.empty?
+    end
+
+    def export(tweets)
+      write_json = lambda { File.write(options['output'], generate_json(tweets)) }
+
+      if options['format'] == 'json'
+        write_json.call
+      elsif options['format'] == 'html'
+        File.write('tweets.html', Template.tweets_embedded_html(tweets))
+      else
+        write_json.call
+      end
     end
 
     def generate_json(tweets)
@@ -51,17 +63,26 @@ module Twitterscraper
         'end_date:',
         'lang:',
         'limit:',
+        'daily_limit:',
         'threads:',
         'output:',
-        '
+        'format:',
+        'cache:',
+        'proxy:',
         'pretty',
         'verbose',
       )
 
+      options['start_date'] = Query::OLDEST_DATE if options['start_date'] == 'oldest'
       options['lang'] ||= ''
       options['limit'] = (options['limit'] || 100).to_i
+      options['daily_limit'] = options['daily_limit'].to_i if options['daily_limit']
       options['threads'] = (options['threads'] || 2).to_i
-      options['
+      options['format'] ||= 'json'
+      options['output'] ||= "tweets.#{options['format']}"
+
+      options['cache'] = options['cache'] != 'false'
+      options['proxy'] = options['proxy'] != 'false'
 
       options
     end
@@ -86,7 +107,7 @@ module Twitterscraper
     end
 
     def print_version
-      puts "twitterscraper-#{
+      puts "twitterscraper-#{VERSION}"
     end
   end
 end
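Note the coercion on the last two options: `cache` and `proxy` arrive from the option parser as strings, and anything other than the literal `'false'` (including `nil`, when the flag is omitted) turns the feature on. A small sketch of that truth table:

```ruby
# Same expression as `options['cache'] = options['cache'] != 'false'` above.
coerce = ->(raw) { raw != 'false' }

coerce.call('false') # => false  (--cache false disables caching)
coerce.call('true')  # => true
coerce.call(nil)     # => true   (omitting the flag leaves caching enabled in CLI runs)
```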
data/lib/twitterscraper/proxy.rb
CHANGED
@@ -17,15 +17,17 @@ module Twitterscraper
           reload
         end
         @cur_index += 1
-
-        Twitterscraper.logger.info("Using proxy #{item}")
-        item
+        @items[@cur_index - 1]
       end
 
       def size
        @items.size
       end
 
+      def empty?
+        @items.empty?
+      end
+
       private
 
       def reload
@@ -51,7 +53,6 @@ module Twitterscraper
           proxies << ip + ':' + port
         end
 
-        Twitterscraper.logger.debug "Fetch #{proxies.size} proxies"
         proxies.shuffle
       rescue => e
         if (retries -= 1) > 0
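The pool is consumed from query.rb (next section) through `empty?` and a rotating accessor; the accessor name `sample` below is assumed from that call site rather than shown in this hunk:

```ruby
proxies = Twitterscraper::Proxy::Pool.new  # fetches and shuffles a fresh proxy list

if proxies.empty?
  proxy = nil                              # no proxies fetched: connect directly
else
  proxy = proxies.sample                   # walks @items in order, reloading when exhausted
end
```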
data/lib/twitterscraper/query.rb
CHANGED
@@ -44,14 +44,18 @@ module Twitterscraper
 
     def get_single_page(url, headers, proxies, timeout = 6, retries = 30)
       return nil if stop_requested?
-
+      unless proxies.empty?
+        proxy = proxies.sample
+        logger.info("Using proxy #{proxy}")
+      end
+      Http.get(url, headers, proxy, timeout)
     rescue => e
       logger.debug "query_single_page: #{e.inspect}"
       if (retries -= 1) > 0
-        logger.info
+        logger.info "Retrying... (Attempts left: #{retries - 1})"
         retry
       else
-        raise
+        raise Error.new("#{e.inspect} url=#{url}")
       end
     end
 
@@ -71,14 +75,27 @@ module Twitterscraper
     end
 
     def query_single_page(query, lang, pos, from_user = false, headers: [], proxies: [])
-      logger.info
+      logger.info "Querying #{query}"
       query = ERB::Util.url_encode(query)
 
       url = build_query_url(query, lang, pos, from_user)
-
+      http_request = lambda do
+        logger.debug "Scraping tweets from #{url}"
+        get_single_page(url, headers, proxies)
+      end
 
-
-
+      if cache_enabled?
+        client = Cache.new
+        if (response = client.read(url))
+          logger.debug 'Fetching tweets from cache'
+        else
+          response = http_request.call
+          client.write(url, response) unless stop_requested?
+        end
+      else
+        response = http_request.call
+      end
+      return [], nil if response.nil? || response.empty?
 
       html, json_resp = parse_single_page(response, pos.nil?)
 
@@ -97,35 +114,36 @@ module Twitterscraper
       end
     end
 
-    OLDEST_DATE = Date.parse('2006-
+    OLDEST_DATE = Date.parse('2006-03-21')
 
-    def validate_options!(
+    def validate_options!(queries, start_date:, end_date:, lang:, limit:, threads:)
+      query = queries[0]
       if query.nil? || query == ''
-        raise 'Please specify a search query.'
+        raise Error.new('Please specify a search query.')
       end
 
       if ERB::Util.url_encode(query).length >= 500
-        raise ':query must be a UTF-8, URL-encoded search query of 500 characters maximum, including operators.'
+        raise Error.new(':query must be a UTF-8, URL-encoded search query of 500 characters maximum, including operators.')
       end
 
       if start_date && end_date
         if start_date == end_date
-          raise 'Please specify different values for :start_date and :end_date.'
+          raise Error.new('Please specify different values for :start_date and :end_date.')
         elsif start_date > end_date
-          raise ':start_date must occur before :end_date.'
+          raise Error.new(':start_date must occur before :end_date.')
         end
       end
 
       if start_date
         if start_date < OLDEST_DATE
-          raise ":start_date must be greater than or equal to #{OLDEST_DATE}"
+          raise Error.new(":start_date must be greater than or equal to #{OLDEST_DATE}")
         end
       end
 
       if end_date
         today = Date.today
         if end_date > Date.today
-          raise ":end_date must be less than or equal to today(#{today})"
+          raise Error.new(":end_date must be less than or equal to today(#{today})")
         end
       end
     end
@@ -143,27 +161,32 @@ module Twitterscraper
       end
     end
 
-    def main_loop(query, lang, limit, headers, proxies)
+    def main_loop(query, lang, limit, daily_limit, headers, proxies)
       pos = nil
+      daily_tweets = []
 
       while true
         new_tweets, new_pos = query_single_page(query, lang, pos, headers: headers, proxies: proxies)
         unless new_tweets.empty?
+          daily_tweets.concat(new_tweets)
+          daily_tweets.uniq! { |t| t.tweet_id }
+
           @mutex.synchronize {
             @all_tweets.concat(new_tweets)
             @all_tweets.uniq! { |t| t.tweet_id }
           }
         end
-        logger.info
+        logger.info "Got #{new_tweets.size} tweets (total #{@all_tweets.size})"
 
         break unless new_pos
+        break if daily_limit && daily_tweets.size >= daily_limit
         break if @all_tweets.size >= limit
 
         pos = new_pos
       end
 
-      if @all_tweets.size >= limit
-        logger.
+      if !@stop_requested && @all_tweets.size >= limit
+        logger.warn "The limit you specified has been reached limit=#{limit} tweets=#{@all_tweets.size}"
         @stop_requested = true
       end
     end
@@ -172,32 +195,46 @@ module Twitterscraper
      @stop_requested
     end
 
-    def query_tweets(query, start_date: nil, end_date: nil, lang: '', limit: 100,
+    def query_tweets(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, threads: 2)
       start_date = Date.parse(start_date) if start_date && start_date.is_a?(String)
       end_date = Date.parse(end_date) if end_date && end_date.is_a?(String)
       queries = build_queries(query, start_date, end_date)
-
-
+      if threads > queries.size
+        logger.warn 'The maximum number of :threads is the number of dates between :start_date and :end_date.'
+        threads = queries.size
+      end
+      if proxy_enabled?
+        proxies = Proxy::Pool.new
+        logger.debug "Fetch #{proxies.size} proxies"
+      else
+        proxies = []
+        logger.debug 'Proxy disabled'
+      end
+      logger.debug "Cache #{cache_enabled? ? 'enabled' : 'disabled'}"
 
-      validate_options!(queries[0], start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads, proxy: proxy)
 
-
+      validate_options!(queries, start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads)
+
+      logger.info "The number of threads #{threads}"
 
       headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
-      logger.info
+      logger.info "Headers #{headers}"
 
       @all_tweets = []
       @mutex = Mutex.new
       @stop_requested = false
 
       if threads > 1
+        Thread.abort_on_exception = true
+        logger.debug "Set 'Thread.abort_on_exception' to true"
+
         Parallel.each(queries, in_threads: threads) do |query|
-          main_loop(query, lang, limit, headers, proxies)
+          main_loop(query, lang, limit, daily_limit, headers, proxies)
           raise Parallel::Break if stop_requested?
         end
       else
        queries.each do |query|
-          main_loop(query, lang, limit, headers, proxies)
+          main_loop(query, lang, limit, daily_limit, headers, proxies)
          break if stop_requested?
        end
      end
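Pulling the new keyword arguments together, a hedged end-to-end call; it assumes, as cli.rb suggests, that `Client.new(cache:, proxy:)` feeds `cache_enabled?`/`proxy_enabled?` and delegates to `query_tweets`:

```ruby
client = Twitterscraper::Client.new(cache: true, proxy: false)

tweets = client.query_tweets('#ruby',
                             start_date: '2020-07-01',
                             end_date: '2020-07-10',
                             limit: 300,       # overall stop condition across all days
                             daily_limit: 50,  # new: stop each per-day query at ~50 tweets
                             threads: 5)       # capped (with a warning) at the number of days queried
```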
data/lib/twitterscraper/template.rb
ADDED
@@ -0,0 +1,48 @@
+module Twitterscraper
+  module Template
+    module_function
+
+    def tweets_embedded_html(tweets)
+      tweets_html = tweets.map { |t| EMBED_TWEET_HTML.sub('__TWEET_URL__', t.tweet_url) }
+      EMBED_TWEETS_HTML.sub('__TWEETS__', tweets_html.join)
+    end
+
+    EMBED_TWEET_HTML = <<~'HTML'
+      <blockquote class="twitter-tweet">
+        <a href="__TWEET_URL__"></a>
+      </blockquote>
+    HTML
+
+    EMBED_TWEETS_HTML = <<~'HTML'
+      <html>
+        <head>
+          <style type=text/css>
+            .twitter-tweet {
+              margin: 30px auto 0 auto !important;
+            }
+          </style>
+          <script>
+            window.twttr = (function(d, s, id) {
+              var js, fjs = d.getElementsByTagName(s)[0], t = window.twttr || {};
+              if (d.getElementById(id)) return t;
+              js = d.createElement(s);
+              js.id = id;
+              js.src = "https://platform.twitter.com/widgets.js";
+              fjs.parentNode.insertBefore(js, fjs);
+
+              t._e = [];
+              t.ready = function(f) {
+                t._e.push(f);
+              };
+
+              return t;
+            }(document, "script", "twitter-wjs"));
+          </script>
+        </head>
+        <body>
+          __TWEETS__
+        </body>
+      </html>
+    HTML
+  end
+end
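This module backs the CLI's new `--format html` path, which writes `Template.tweets_embedded_html(tweets)` to `tweets.html` (see cli.rb above). A standalone sketch:

```ruby
tweets = client.query_tweets('ruby', limit: 10)

# Same call the CLI makes for --format html:
File.write('tweets.html', Twitterscraper::Template.tweets_embedded_html(tweets))
# Opening tweets.html loads widgets.js, which upgrades each blockquote
# into a fully rendered embedded tweet.
```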
data/lib/twitterscraper/tweet.rb
CHANGED
@@ -21,6 +21,7 @@ module Twitterscraper
       :parent_tweet_id,
       :reply_to_users,
       :tweet_url,
+      :timestamp,
       :created_at,
     ]
     attr_reader *KEYS
@@ -31,13 +32,25 @@ module Twitterscraper
       end
     end
 
-    def
+    def attrs
       KEYS.map do |key|
         [key, send(key)]
-      end.to_h
+      end.to_h
+    end
+
+    def to_json(options = {})
+      attrs.to_json
     end
 
     class << self
+      def from_json(text)
+        json = JSON.parse(text)
+        json.map do |tweet|
+          tweet['created_at'] = Time.parse(tweet['created_at'])
+          new(tweet)
+        end
+      end
+
       def from_html(text)
         html = Nokogiri::HTML(text)
         from_tweets_html(html.xpath("//li[@class[contains(., 'js-stream-item')]]/div[@class[contains(., 'js-stream-tweet')]]"))
@@ -46,12 +59,19 @@ module Twitterscraper
       def from_tweets_html(html)
         html.map do |tweet|
           from_tweet_html(tweet)
-        end
+        end.compact
       end
 
       def from_tweet_html(html)
+        screen_name = html.attr('data-screen-name')
+        tweet_id = html.attr('data-tweet-id')&.to_i
+
+        unless html.to_s.include?('js-tweet-text-container')
+          Twitterscraper.logger.warn "html doesn't include div.js-tweet-text-container url=https://twitter.com/#{screen_name}/status/#{tweet_id}"
+          return nil
+        end
+
         inner_html = Nokogiri::HTML(html.inner_html)
-        tweet_id = html.attr('data-tweet-id').to_i
         text = inner_html.xpath("//div[@class[contains(., 'js-tweet-text-container')]]/p[@class[contains(., 'js-tweet-text')]]").first.text
         links = inner_html.xpath("//a[@class[contains(., 'twitter-timeline-link')]]").map { |elem| elem.attr('data-expanded-url') }.select { |link| link && !link.include?('pic.twitter') }
         image_urls = inner_html.xpath("//div[@class[contains(., 'AdaptiveMedia-photoContainer')]]").map { |elem| elem.attr('data-image-url') }
@@ -74,9 +94,9 @@ module Twitterscraper
           reply_to_users = inner_html.xpath("//div[@class[contains(., 'ReplyingToContextBelowAuthor')]]/a").map { |user| {screen_name: user.text.delete_prefix('@'), user_id: user.attr('data-user-id')} }
         end
 
-        timestamp = inner_html.xpath("//span[@class[contains(., '
+        timestamp = inner_html.xpath("//span[@class[contains(., 'js-short-timestamp')]]").first.attr('data-time').to_i
         new(
-          screen_name:
+          screen_name: screen_name,
          name: html.attr('data-name'),
          user_id: html.attr('data-user-id').to_i,
          tweet_id: tweet_id,
@@ -94,6 +114,7 @@ module Twitterscraper
          parent_tweet_id: parent_tweet_id,
          reply_to_users: reply_to_users,
          tweet_url: 'https://twitter.com' + html.attr('data-permalink-path'),
+          timestamp: timestamp,
          created_at: Time.at(timestamp, in: '+00:00'),
        )
      end
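With `attrs`, `to_json`, and `from_json` in place, scraped tweets round-trip through the JSON file the CLI writes. A sketch (assuming `json` and `time` are already loaded, as the parsing here implies):

```ruby
File.write('tweets.json', tweets.to_json)  # Array#to_json serializes each Tweet via #to_json

restored = Twitterscraper::Tweet.from_json(File.read('tweets.json'))
restored.first.created_at                  # => Time, re-parsed from the serialized string
restored.first.attrs.keys                  # includes the new :timestamp key
```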
data/lib/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: twitterscraper-ruby
 version: !ruby/object:Gem::Version
-  version: 0.8.0
+  version: 0.13.0
 platform: ruby
 authors:
 - ts-3156
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2020-07-
+date: 2020-07-16 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -48,6 +48,7 @@ extra_rdoc_files: []
 files:
 - ".gitignore"
 - ".irbrc"
+- ".rspec"
 - ".ruby-version"
 - ".travis.yml"
 - CODE_OF_CONDUCT.md
@@ -61,6 +62,7 @@ files:
 - bin/twitterscraper
 - lib/twitterscraper-ruby.rb
 - lib/twitterscraper.rb
+- lib/twitterscraper/cache.rb
 - lib/twitterscraper/cli.rb
 - lib/twitterscraper/client.rb
 - lib/twitterscraper/http.rb
@@ -68,6 +70,7 @@ files:
 - lib/twitterscraper/logger.rb
 - lib/twitterscraper/proxy.rb
 - lib/twitterscraper/query.rb
+- lib/twitterscraper/template.rb
 - lib/twitterscraper/tweet.rb
 - lib/version.rb
 - twitterscraper-ruby.gemspec