twitterscraper-ruby 0.7.0 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -1
- data/Gemfile.lock +1 -1
- data/README.md +56 -19
- data/bin/twitterscraper +1 -1
- data/lib/twitterscraper.rb +2 -0
- data/lib/twitterscraper/cache.rb +69 -0
- data/lib/twitterscraper/cli.rb +23 -4
- data/lib/twitterscraper/client.rb +8 -0
- data/lib/twitterscraper/proxy.rb +5 -4
- data/lib/twitterscraper/query.rb +50 -24
- data/lib/twitterscraper/template.rb +48 -0
- data/lib/twitterscraper/tweet.rb +83 -7
- data/lib/version.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: e98afb0444b724e0c9c29f6b888c017166859d1252337f34686526060ca8368d
+  data.tar.gz: 5ef3ff7f86d9a0c9dd1883d55498049d0f164aa7c71a7c9c2bbf0a89ae9bb32c
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 04ef61c57545cbbdbbe5da53d1f24cf064b7d1c61ad3da9bc57a361d24ed24480c4f68fa1fea67345ceff4d4d4685f046a4586f55ebe8f3dc0ca6332c7c2d928
+  data.tar.gz: f5fd19c8289c7caf574dc78f754ba9aaf9446f3819b394d14414909b1505e0f9b25181802448d28285de8db81a27e12e0e65d1e1a0b2b0e5df8e7e73d6263e14
data/.gitignore
CHANGED
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -33,7 +33,7 @@ Command-line interface:
 
 ```shell script
 $ twitterscraper --query KEYWORD --start_date 2020-06-01 --end_date 2020-06-30 --lang ja \
-    --limit 100 --threads 10 --proxy --output output.json
+    --limit 100 --threads 10 --proxy --cache --output output.json
 ```
 
 From Within Ruby:
@@ -56,12 +56,60 @@ tweets = client.query_tweets(KEYWORD, options)
 tweets.each do |tweet|
   puts tweet.tweet_id
   puts tweet.text
-  puts tweet.created_at
   puts tweet.tweet_url
+  puts tweet.created_at
+
+  hash = tweet.attrs
+  puts hash.keys
 end
 ```
 
 
+## Attributes
+
+### Tweet
+
+- screen_name
+- name
+- user_id
+- tweet_id
+- text
+- links
+- hashtags
+- image_urls
+- video_url
+- has_media
+- likes
+- retweets
+- replies
+- is_replied
+- is_reply_to
+- parent_tweet_id
+- reply_to_users
+- tweet_url
+- created_at
+
+
+## Search operators
+
+| Operator | Finds Tweets... |
+| ------------- | ------------- |
+| watching now | containing both "watching" and "now". This is the default operator. |
+| "happy hour" | containing the exact phrase "happy hour". |
+| love OR hate | containing either "love" or "hate" (or both). |
+| beer -root | containing "beer" but not "root". |
+| #haiku | containing the hashtag "haiku". |
+| from:interior | sent from Twitter account "interior". |
+| to:NASA | a Tweet authored in reply to Twitter account "NASA". |
+| @NASA | mentioning Twitter account "NASA". |
+| puppy filter:media | containing "puppy" and an image or video. |
+| puppy -filter:retweets | containing "puppy", filtering out retweets |
+| superhero since:2015-12-21 | containing "superhero" and sent since date "2015-12-21" (year-month-day). |
+| puppy until:2015-12-21 | containing "puppy" and sent before the date "2015-12-21". |
+
+Search operators documentation is in [Standard search operators](https://developer.twitter.com/en/docs/tweets/rules-and-filtering/overview/standard-operators).
+
+
 ## Examples
 
 ```shell script
@@ -79,37 +127,26 @@ $ cat tweets.json | jq . | less
     "tweet_url": "https://twitter.com/screenname/status/1282659891992000000",
     "created_at": "2020-07-13 12:00:00 +0000",
     "text": "Thanks Twitter!"
-  }
-  ...
+  }
 ]
 ```
 
-## Attributes
-
-### Tweet
-
-- tweet_id
-- text
-- user_id
-- screen_name
-- name
-- tweet_url
-- created_at
-
-
 ## CLI Options
 
 | Option | Description | Default |
 | ------------- | ------------- | ------------- |
 | `-h`, `--help` | This option displays a summary of twitterscraper. | |
 | `--query` | Specify a keyword used during the search. | |
-| `--start_date` |
-| `--end_date` |
+| `--start_date` | Used as "since:yyyy-mm-dd for your query. This means "since the date". | |
+| `--end_date` | Used as "until:yyyy-mm-dd for your query. This means "before the date". | |
 | `--lang` | Retrieve tweets written in a specific language. | |
 | `--limit` | Stop scraping when *at least* the number of tweets indicated with --limit is scraped. | 100 |
 | `--threads` | Set the number of threads twitterscraper-ruby should initiate while scraping for your query. | 2 |
 | `--proxy` | Scrape https://twitter.com/search via proxies. | false |
+| `--cache` | Enable caching. | false |
+| `--format` | The format of the output. | json |
 | `--output` | The name of the output file. | tweets.json |
+| `--verbose` | Print debug messages. | tweets.json |
 
 
 ## Contributing
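
As context for the README hunk above, the documented Ruby snippet now exposes `tweet.attrs`, which returns every field in the Attributes list as a hash. A minimal sketch, assuming the gem is installed; the keyword and option values are placeholders, and `Client.new` with no arguments is an assumption since the client.rb hunk is not rendered in this diff:

```ruby
require 'twitterscraper-ruby'

# Placeholder keyword and options, mirroring the README example above.
options = {
  start_date: '2020-06-01',
  end_date:   '2020-06-30',
  lang:       'ja',
  limit:      100,
  threads:    10,
  proxy:      false,
}

client = Twitterscraper::Client.new
tweets = client.query_tweets('KEYWORD', options)

tweets.each do |tweet|
  hash = tweet.attrs # every attribute in the list above, keyed by symbol
  puts hash.keys
  puts hash[:tweet_url]
  puts hash[:created_at]
end
```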
data/bin/twitterscraper
CHANGED
data/lib/twitterscraper.rb
CHANGED
@@ -2,9 +2,11 @@ require 'twitterscraper/logger'
 require 'twitterscraper/proxy'
 require 'twitterscraper/http'
 require 'twitterscraper/lang'
+require 'twitterscraper/cache'
 require 'twitterscraper/query'
 require 'twitterscraper/client'
 require 'twitterscraper/tweet'
+require 'twitterscraper/template'
 require 'version'
 
 module Twitterscraper
data/lib/twitterscraper/cache.rb
ADDED
@@ -0,0 +1,69 @@
+require 'base64'
+require 'digest/md5'
+
+module Twitterscraper
+  class Cache
+    def initialize()
+      @ttl = 3600 # 1 hour
+      @dir = 'cache'
+      Dir.mkdir(@dir) unless File.exist?(@dir)
+    end
+
+    def read(key)
+      key = cache_key(key)
+      file = File.join(@dir, key)
+      entry = Entry.from_json(File.read(file))
+      entry.value if entry.time > Time.now - @ttl
+    rescue Errno::ENOENT => e
+      nil
+    end
+
+    def write(key, value)
+      key = cache_key(key)
+      entry = Entry.new(key, value, Time.now)
+      file = File.join(@dir, key)
+      File.write(file, entry.to_json)
+    end
+
+    def fetch(key, &block)
+      if (value = read(key))
+        value
+      else
+        yield.tap { |v| write(key, v) }
+      end
+    end
+
+    def cache_key(key)
+      value = key.gsub(':', '%3A').gsub('/', '%2F').gsub('?', '%3F').gsub('=', '%3D').gsub('&', '%26')
+      value = Digest::MD5.hexdigest(value) if value.length >= 100
+      value
+    end
+
+    class Entry < Hash
+      attr_reader :key, :value, :time
+
+      def initialize(key, value, time)
+        @key = key
+        @value = value
+        @time = time
+      end
+
+      def attrs
+        {key: @key, value: @value, time: @time}
+      end
+
+      def to_json
+        hash = attrs
+        hash[:value] = Base64.encode64(hash[:value])
+        hash.to_json
+      end
+
+      class << self
+        def from_json(text)
+          json = JSON.parse(text)
+          new(json['key'], Base64.decode64(json['value']), Time.parse(json['time']))
+        end
+      end
+    end
+  end
+end
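
The new cache is file-based with a fixed one-hour TTL. A small sketch of using it directly, assuming `require 'twitterscraper'` loads the file above and that writing a ./cache directory in the working directory is acceptable; the keys and bodies are placeholders:

```ruby
require 'json'
require 'time'
require 'twitterscraper' # assumed entry point; it requires twitterscraper/cache per the hunk above

cache = Twitterscraper::Cache.new # creates ./cache and uses a 1-hour TTL

# write/read round-trip; keys are URL-escaped (or MD5-hashed when 100+ chars) by cache_key
cache.write('https://example.com/search?q=ruby', '<html>cached body</html>')
puts cache.read('https://example.com/search?q=ruby')

# fetch returns the cached value, or runs the block and stores its result on a miss
body = cache.fetch('https://example.com/other') { '<html>fresh body</html>' }
puts body
```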
data/lib/twitterscraper/cli.rb
CHANGED
@@ -20,12 +20,25 @@ module Twitterscraper
         end_date: options['end_date'],
         lang: options['lang'],
         limit: options['limit'],
+        daily_limit: options['daily_limit'],
         threads: options['threads'],
         proxy: options['proxy']
       }
-      client = Twitterscraper::Client.new
+      client = Twitterscraper::Client.new(cache: options['cache'])
       tweets = client.query_tweets(options['query'], query_options)
-
+      export(tweets) unless tweets.empty?
+    end
+
+    def export(tweets)
+      write_json = lambda { File.write(options['output'], generate_json(tweets)) }
+
+      if options['format'] == 'json'
+        write_json.call
+      elsif options['format'] == 'html'
+        File.write('tweets.html', Template.tweets_embedded_html(tweets))
+      else
+        write_json.call
+      end
     end
 
     def generate_json(tweets)
@@ -51,17 +64,23 @@ module Twitterscraper
         'end_date:',
         'lang:',
         'limit:',
+        'daily_limit:',
         'threads:',
         'output:',
+        'format:',
+        'cache',
         'proxy',
         'pretty',
         'verbose',
       )
 
+      options['start_date'] = Query::OLDEST_DATE if options['start_date'] == 'oldest'
       options['lang'] ||= ''
      options['limit'] = (options['limit'] || 100).to_i
+      options['daily_limit'] = options['daily_limit'].to_i if options['daily_limit']
       options['threads'] = (options['threads'] || 2).to_i
-      options['
+      options['format'] ||= 'json'
+      options['output'] ||= "tweets.#{options['format']}"
 
       options
     end
@@ -86,7 +105,7 @@ module Twitterscraper
     end
 
     def print_version
-      puts "twitterscraper-#{
+      puts "twitterscraper-#{VERSION}"
     end
   end
 end
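
The new `export` method branches on `--format`. A rough standalone equivalent, assuming `Client.new(cache:)` and `Template.tweets_embedded_html` behave as in the hunks in this diff; `generate_json` is not shown here, so plain `to_json` stands in for it:

```ruby
require 'twitterscraper-ruby'

# cache: mirrors the new --cache flag; the CLI passes it to Client.new as shown above
client = Twitterscraper::Client.new(cache: true)
tweets = client.query_tweets('ruby', limit: 10)

# --format json (the default) writes serialized tweets to --output
File.write('tweets.json', tweets.to_json)

# --format html renders an embedded-tweet page through the new Template module
File.write('tweets.html', Twitterscraper::Template.tweets_embedded_html(tweets))
```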
data/lib/twitterscraper/proxy.rb
CHANGED
@@ -17,15 +17,17 @@ module Twitterscraper
         reload
       end
       @cur_index += 1
-
-      Twitterscraper.logger.info("Using proxy #{item}")
-      item
+      @items[@cur_index - 1]
     end
 
     def size
       @items.size
     end
 
+    def empty?
+      @items.empty?
+    end
+
     private
 
     def reload
@@ -51,7 +53,6 @@ module Twitterscraper
         proxies << ip + ':' + port
       end
 
-      Twitterscraper.logger.debug "Fetch #{proxies.size} proxies"
       proxies.shuffle
     rescue => e
       if (retries -= 1) > 0
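
With the logging removed from the pool, callers now log the chosen proxy themselves, as query.rb does later in this diff. A sketch of the pool's surface, inferred from the `Proxy::Pool.new`, `proxies.sample`, and `proxies.empty?` calls elsewhere in this diff; treat the constant and method names as assumptions beyond what those hunks show:

```ruby
require 'twitterscraper-ruby'

pool = Twitterscraper::Proxy::Pool.new

unless pool.empty?
  proxy = pool.sample # now just returns the next item, without logging
  Twitterscraper.logger.info("Using proxy #{proxy}") # logging moved to the caller
end

puts pool.size
```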
data/lib/twitterscraper/query.rb
CHANGED
@@ -44,14 +44,18 @@ module Twitterscraper
 
     def get_single_page(url, headers, proxies, timeout = 6, retries = 30)
       return nil if stop_requested?
-
+      unless proxies.empty?
+        proxy = proxies.sample
+        logger.info("Using proxy #{proxy}")
+      end
+      Http.get(url, headers, proxy, timeout)
     rescue => e
       logger.debug "query_single_page: #{e.inspect}"
       if (retries -= 1) > 0
-        logger.info
+        logger.info "Retrying... (Attempts left: #{retries - 1})"
         retry
       else
-        raise
+        raise Error.new("#{e.inspect} url=#{url}")
       end
     end
 
@@ -71,14 +75,27 @@ module Twitterscraper
     end
 
     def query_single_page(query, lang, pos, from_user = false, headers: [], proxies: [])
-      logger.info
+      logger.info "Querying #{query}"
       query = ERB::Util.url_encode(query)
 
       url = build_query_url(query, lang, pos, from_user)
-
+      http_request = lambda do
+        logger.debug "Scraping tweets from #{url}"
+        get_single_page(url, headers, proxies)
+      end
 
-
-
+      if cache_enabled?
+        client = Cache.new
+        if (response = client.read(url))
+          logger.debug 'Fetching tweets from cache'
+        else
+          response = http_request.call
+          client.write(url, response) unless stop_requested?
+        end
+      else
+        response = http_request.call
+      end
+      return [], nil if response.nil? || response.empty?
 
       html, json_resp = parse_single_page(response, pos.nil?)
 
@@ -97,35 +114,35 @@
       end
     end
 
-    OLDEST_DATE = Date.parse('2006-
+    OLDEST_DATE = Date.parse('2006-03-21')
 
     def validate_options!(query, start_date:, end_date:, lang:, limit:, threads:, proxy:)
       if query.nil? || query == ''
-        raise 'Please specify a search query.'
+        raise Error.new('Please specify a search query.')
       end
 
       if ERB::Util.url_encode(query).length >= 500
-        raise ':query must be a UTF-8, URL-encoded search query of 500 characters maximum, including operators.'
+        raise Error.new(':query must be a UTF-8, URL-encoded search query of 500 characters maximum, including operators.')
       end
 
       if start_date && end_date
         if start_date == end_date
-          raise 'Please specify different values for :start_date and :end_date.'
+          raise Error.new('Please specify different values for :start_date and :end_date.')
         elsif start_date > end_date
-          raise ':start_date must occur before :end_date.'
+          raise Error.new(':start_date must occur before :end_date.')
         end
       end
 
       if start_date
         if start_date < OLDEST_DATE
-          raise ":start_date must be greater than or equal to #{OLDEST_DATE}"
+          raise Error.new(":start_date must be greater than or equal to #{OLDEST_DATE}")
         end
       end
 
       if end_date
         today = Date.today
         if end_date > Date.today
-          raise ":end_date must be less than or equal to today(#{today})"
+          raise Error.new(":end_date must be less than or equal to today(#{today})")
         end
       end
     end
@@ -143,27 +160,32 @@
       end
     end
 
-    def main_loop(query, lang, limit, headers, proxies)
+    def main_loop(query, lang, limit, daily_limit, headers, proxies)
       pos = nil
+      daily_tweets = []
 
       while true
         new_tweets, new_pos = query_single_page(query, lang, pos, headers: headers, proxies: proxies)
         unless new_tweets.empty?
+          daily_tweets.concat(new_tweets)
+          daily_tweets.uniq! { |t| t.tweet_id }
+
           @mutex.synchronize {
             @all_tweets.concat(new_tweets)
             @all_tweets.uniq! { |t| t.tweet_id }
           }
         end
-        logger.info
+        logger.info "Got #{new_tweets.size} tweets (total #{@all_tweets.size})"
 
         break unless new_pos
+        break if daily_limit && daily_tweets.size >= daily_limit
         break if @all_tweets.size >= limit
 
         pos = new_pos
       end
 
-      if @all_tweets.size >= limit
-        logger.
+      if !@stop_requested && @all_tweets.size >= limit
+        logger.warn "The limit you specified has been reached limit=#{limit} tweets=#{@all_tweets.size}"
         @stop_requested = true
       end
     end
@@ -172,32 +194,36 @@
       @stop_requested
     end
 
-    def query_tweets(query, start_date: nil, end_date: nil, lang: '', limit: 100, threads: 2, proxy: false)
+    def query_tweets(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, threads: 2, proxy: false)
       start_date = Date.parse(start_date) if start_date && start_date.is_a?(String)
       end_date = Date.parse(end_date) if end_date && end_date.is_a?(String)
       queries = build_queries(query, start_date, end_date)
       threads = queries.size if threads > queries.size
-      proxies = proxy ?
+      proxies = proxy ? Proxy::Pool.new : []
 
       validate_options!(queries[0], start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads, proxy: proxy)
 
-      logger.
+      logger.debug "Fetch #{proxies.size} proxies" if proxy
+      logger.info "The number of threads #{threads}"
 
       headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
-      logger.info
+      logger.info "Headers #{headers}"
 
       @all_tweets = []
       @mutex = Mutex.new
      @stop_requested = false
 
       if threads > 1
+        Thread.abort_on_exception = true
+        logger.debug "Set 'Thread.abort_on_exception' to true"
+
         Parallel.each(queries, in_threads: threads) do |query|
-          main_loop(query, lang, limit, headers, proxies)
+          main_loop(query, lang, limit, daily_limit, headers, proxies)
           raise Parallel::Break if stop_requested?
         end
       else
         queries.each do |query|
-          main_loop(query, lang, limit, headers, proxies)
+          main_loop(query, lang, limit, daily_limit, headers, proxies)
           break if stop_requested?
         end
       end
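
Seen from the caller, the query.rb changes add a `daily_limit:` keyword and optional response caching. A usage sketch with placeholder values; `daily_limit` roughly caps how many tweets are kept per one-day query before that day's loop stops, while `limit` still bounds the overall total, and whether caching is on is controlled by the `cache:` flag passed to `Client.new` per the cli.rb hunk above:

```ruby
require 'twitterscraper-ruby'

client = Twitterscraper::Client.new(cache: true)

# daily_limit stops each per-day loop once enough tweets are collected for that day;
# limit still caps the combined total across all day-by-day queries.
tweets = client.query_tweets('ruby',
                             start_date: '2020-06-01',
                             end_date: '2020-06-03',
                             lang: 'ja',
                             limit: 100,
                             daily_limit: 10,
                             threads: 2,
                             proxy: false)
puts tweets.size
```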
data/lib/twitterscraper/template.rb
ADDED
@@ -0,0 +1,48 @@
+module Twitterscraper
+  module Template
+    module_function
+
+    def tweets_embedded_html(tweets)
+      tweets_html = tweets.map { |t| EMBED_TWEET_HTML.sub('__TWEET_URL__', t.tweet_url) }
+      EMBED_TWEETS_HTML.sub('__TWEETS__', tweets_html.join)
+    end
+
+    EMBED_TWEET_HTML = <<~'HTML'
+      <blockquote class="twitter-tweet">
+        <a href="__TWEET_URL__"></a>
+      </blockquote>
+    HTML
+
+    EMBED_TWEETS_HTML = <<~'HTML'
+      <html>
+        <head>
+          <style type=text/css>
+            .twitter-tweet {
+              margin: 30px auto 0 auto !important;
+            }
+          </style>
+          <script>
+            window.twttr = (function(d, s, id) {
+              var js, fjs = d.getElementsByTagName(s)[0], t = window.twttr || {};
+              if (d.getElementById(id)) return t;
+              js = d.createElement(s);
+              js.id = id;
+              js.src = "https://platform.twitter.com/widgets.js";
+              fjs.parentNode.insertBefore(js, fjs);
+
+              t._e = [];
+              t.ready = function(f) {
+                t._e.push(f);
+              };
+
+              return t;
+            }(document, "script", "twitter-wjs"));
+          </script>
+        </head>
+        <body>
+          __TWEETS__
+        </body>
+      </html>
+    HTML
+  end
+end
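
`Template.tweets_embedded_html` only calls `#tweet_url` on each element, so it can be tried without scraping anything. A sketch with a stand-in struct and a placeholder tweet URL:

```ruby
require 'twitterscraper-ruby'

# The template only reads tweet_url, so a stub object is enough here.
StubTweet = Struct.new(:tweet_url)
tweets = [StubTweet.new('https://twitter.com/jack/status/20')]

# Writes a page of blockquote embeds hydrated by widgets.js, as defined above.
File.write('tweets.html', Twitterscraper::Template.tweets_embedded_html(tweets))
```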
data/lib/twitterscraper/tweet.rb
CHANGED
@@ -2,7 +2,28 @@ require 'time'
 
 module Twitterscraper
   class Tweet
-    KEYS = [
+    KEYS = [
+        :screen_name,
+        :name,
+        :user_id,
+        :tweet_id,
+        :text,
+        :links,
+        :hashtags,
+        :image_urls,
+        :video_url,
+        :has_media,
+        :likes,
+        :retweets,
+        :replies,
+        :is_replied,
+        :is_reply_to,
+        :parent_tweet_id,
+        :reply_to_users,
+        :tweet_url,
+        :timestamp,
+        :created_at,
+    ]
     attr_reader *KEYS
 
     def initialize(attrs)
@@ -11,13 +32,25 @@ module Twitterscraper
       end
     end
 
-    def
+    def attrs
       KEYS.map do |key|
         [key, send(key)]
-      end.to_h
+      end.to_h
+    end
+
+    def to_json(options = {})
+      attrs.to_json
     end
 
     class << self
+      def from_json(text)
+        json = JSON.parse(text)
+        json.map do |tweet|
+          tweet['created_at'] = Time.parse(tweet['created_at'])
+          new(tweet)
+        end
+      end
+
       def from_html(text)
         html = Nokogiri::HTML(text)
         from_tweets_html(html.xpath("//li[@class[contains(., 'js-stream-item')]]/div[@class[contains(., 'js-stream-tweet')]]"))
@@ -26,20 +59,63 @@ module Twitterscraper
       def from_tweets_html(html)
         html.map do |tweet|
           from_tweet_html(tweet)
-        end
+        end.compact
       end
 
       def from_tweet_html(html)
+        screen_name = html.attr('data-screen-name')
+        tweet_id = html.attr('data-tweet-id')&.to_i
+
+        unless html.to_s.include?('js-tweet-text-container')
+          Twitterscraper.logger.warn "html doesn't include div.js-tweet-text-container url=https://twitter.com/#{screen_name}/status/#{tweet_id}"
+          return nil
+        end
+
         inner_html = Nokogiri::HTML(html.inner_html)
+        text = inner_html.xpath("//div[@class[contains(., 'js-tweet-text-container')]]/p[@class[contains(., 'js-tweet-text')]]").first.text
+        links = inner_html.xpath("//a[@class[contains(., 'twitter-timeline-link')]]").map { |elem| elem.attr('data-expanded-url') }.select { |link| link && !link.include?('pic.twitter') }
+        image_urls = inner_html.xpath("//div[@class[contains(., 'AdaptiveMedia-photoContainer')]]").map { |elem| elem.attr('data-image-url') }
+        video_url = inner_html.xpath("//div[@class[contains(., 'PlayableMedia-container')]]/a").map { |elem| elem.attr('href') }[0]
+        has_media = !image_urls.empty? || (video_url && !video_url.empty?)
+
+        actions = inner_html.xpath("//div[@class[contains(., 'ProfileTweet-actionCountList')]]")
+        likes = actions.xpath("//span[@class[contains(., 'ProfileTweet-action--favorite')]]/span[@class[contains(., 'ProfileTweet-actionCount')]]").first.attr('data-tweet-stat-count').to_i || 0
+        retweets = actions.xpath("//span[@class[contains(., 'ProfileTweet-action--retweet')]]/span[@class[contains(., 'ProfileTweet-actionCount')]]").first.attr('data-tweet-stat-count').to_i || 0
+        replies = actions.xpath("//span[@class[contains(., 'ProfileTweet-action--reply u-hiddenVisually')]]/span[@class[contains(., 'ProfileTweet-actionCount')]]").first.attr('data-tweet-stat-count').to_i || 0
+        is_replied = replies != 0
+
+        parent_tweet_id = inner_html.xpath('//*[@data-conversation-id]').first.attr('data-conversation-id').to_i
+        if tweet_id == parent_tweet_id
+          is_reply_to = false
+          parent_tweet_id = nil
+          reply_to_users = []
+        else
+          is_reply_to = true
+          reply_to_users = inner_html.xpath("//div[@class[contains(., 'ReplyingToContextBelowAuthor')]]/a").map { |user| {screen_name: user.text.delete_prefix('@'), user_id: user.attr('data-user-id')} }
+        end
+
         timestamp = inner_html.xpath("//span[@class[contains(., 'js-short-timestamp')]]").first.attr('data-time').to_i
         new(
-            screen_name:
+            screen_name: screen_name,
             name: html.attr('data-name'),
             user_id: html.attr('data-user-id').to_i,
-            tweet_id:
+            tweet_id: tweet_id,
+            text: text,
+            links: links,
+            hashtags: text.scan(/#\w+/).map { |tag| tag.delete_prefix('#') },
+            image_urls: image_urls,
+            video_url: video_url,
+            has_media: has_media,
+            likes: likes,
+            retweets: retweets,
+            replies: replies,
+            is_replied: is_replied,
+            is_reply_to: is_reply_to,
+            parent_tweet_id: parent_tweet_id,
+            reply_to_users: reply_to_users,
             tweet_url: 'https://twitter.com' + html.attr('data-permalink-path'),
+            timestamp: timestamp,
             created_at: Time.at(timestamp, in: '+00:00'),
-            text: inner_html.xpath("//div[@class[contains(., 'js-tweet-text-container')]]/p[@class[contains(., 'js-tweet-text')]]").first.text,
         )
       end
     end
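
The new `to_json` / `from_json` pair makes the CLI's JSON output round-trippable back into `Tweet` objects. A sketch that assumes a `tweets.json` written by the CLI (the default output name from the README):

```ruby
require 'twitterscraper-ruby'

# from_json parses the JSON array of attrs hashes and rebuilds Tweet objects,
# converting each created_at string back into a Time along the way.
tweets = Twitterscraper::Tweet.from_json(File.read('tweets.json'))

tweets.each do |tweet|
  puts "#{tweet.screen_name}: #{tweet.text} (#{tweet.created_at})"
end
```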
data/lib/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: twitterscraper-ruby
 version: !ruby/object:Gem::Version
-  version: 0.
+  version: 0.12.0
 platform: ruby
 authors:
 - ts-3156
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2020-07-
+date: 2020-07-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -61,6 +61,7 @@ files:
 - bin/twitterscraper
 - lib/twitterscraper-ruby.rb
 - lib/twitterscraper.rb
+- lib/twitterscraper/cache.rb
 - lib/twitterscraper/cli.rb
 - lib/twitterscraper/client.rb
 - lib/twitterscraper/http.rb
@@ -68,6 +69,7 @@ files:
 - lib/twitterscraper/logger.rb
 - lib/twitterscraper/proxy.rb
 - lib/twitterscraper/query.rb
+- lib/twitterscraper/template.rb
 - lib/twitterscraper/tweet.rb
 - lib/version.rb
 - twitterscraper-ruby.gemspec