twitterscraper-ruby 0.15.0 → 0.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/Gemfile.lock +1 -1
- data/README.md +70 -46
- data/lib/twitterscraper.rb +1 -0
- data/lib/twitterscraper/cache.rb +7 -1
- data/lib/twitterscraper/cli.rb +17 -6
- data/lib/twitterscraper/query.rb +59 -28
- data/lib/twitterscraper/template.rb +53 -42
- data/lib/twitterscraper/template/tweets.html.erb +109 -0
- data/lib/twitterscraper/tweet.rb +9 -0
- data/lib/twitterscraper/type.rb +15 -0
- data/lib/version.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8e9bdefe1c4d10e6d9f1d12aeb279b2a3751c570e96e05daaf849dd423bb03bf
|
4
|
+
data.tar.gz: 7de97de19daeecce2837fe8e5999b6c9490ab49a18a2ab9e603bf4d039abc4b9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 55b7e0b52b2ce44418305798ed27a677405244a48f5ad0a797e3abf7958b0581a313ebd33f3f69b891ba7454f8f5c9c0db845c9ca8be321cd27212932821776e
|
7
|
+
data.tar.gz: 8fe97a0dc164fc0108b8e6a35843fba19ade5fbaf4f1ee2b4a400afbd3bdbb220a49dfbef4fceb1d8ecc43df3b4f4b7bad0ee5ea94c0aac464c0477e42efb866
|
data/.gitignore
CHANGED
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -5,15 +5,17 @@
|
|
5
5
|
|
6
6
|
A gem to scrape https://twitter.com/search. This gem is inspired by [taspinar/twitterscraper](https://github.com/taspinar/twitterscraper).
|
7
7
|
|
8
|
+
Please feel free to ask [@ts_3156](https://twitter.com/ts_3156) if you have any questions.
|
9
|
+
|
8
10
|
|
9
11
|
## Twitter Search API vs. twitterscraper-ruby
|
10
12
|
|
11
|
-
|
13
|
+
#### Twitter Search API
|
12
14
|
|
13
15
|
- The number of tweets: 180 - 450 requests/15 minutes (18,000 - 45,000 tweets/15 minutes)
|
14
16
|
- The time window: the past 7 days
|
15
17
|
|
16
|
-
|
18
|
+
#### twitterscraper-ruby
|
17
19
|
|
18
20
|
- The number of tweets: Unlimited
|
19
21
|
- The time window: from 2006-3-21 to today
|
@@ -30,37 +32,49 @@ $ gem install twitterscraper-ruby
|
|
30
32
|
|
31
33
|
## Usage
|
32
34
|
|
33
|
-
Command-line interface:
|
35
|
+
#### Command-line interface:
|
36
|
+
|
37
|
+
Returns a collection of relevant tweets matching a specified query.
|
34
38
|
|
35
39
|
```shell script
|
36
|
-
# Returns a collection of relevant tweets matching a specified query.
|
37
40
|
$ twitterscraper --type search --query KEYWORD --start_date 2020-06-01 --end_date 2020-06-30 --lang ja \
|
38
41
|
--limit 100 --threads 10 --output tweets.json
|
39
42
|
```
|
40
43
|
|
44
|
+
Returns a collection of the most recent tweets posted by the user indicated by the screen_name.
|
45
|
+
|
41
46
|
```shell script
|
42
|
-
# Returns a collection of the most recent tweets posted by the user indicated by the screen_name
|
43
47
|
$ twitterscraper --type user --query SCREEN_NAME --limit 100 --output tweets.json
|
44
48
|
```
|
45
49
|
|
46
|
-
From Within Ruby:
|
50
|
+
#### From Within Ruby:
|
47
51
|
|
48
52
|
```ruby
|
49
53
|
require 'twitterscraper'
|
50
54
|
client = Twitterscraper::Client.new(cache: true, proxy: true)
|
51
55
|
```
|
52
56
|
|
57
|
+
Returns a collection of relevant tweets matching a specified query.
|
58
|
+
|
53
59
|
```ruby
|
54
|
-
# Returns a collection of relevant tweets matching a specified query.
|
55
60
|
tweets = client.search(KEYWORD, start_date: '2020-06-01', end_date: '2020-06-30', lang: 'ja', limit: 100, threads: 10)
|
56
61
|
```
|
57
62
|
|
63
|
+
Returns a collection of the most recent tweets posted by the user indicated by the screen_name.
|
64
|
+
|
58
65
|
```ruby
|
59
|
-
# Returns a collection of the most recent tweets posted by the user indicated by the screen_name
|
60
66
|
tweets = client.user_timeline(SCREEN_NAME, limit: 100)
|
61
67
|
```
|
62
68
|
|
63
69
|
|
70
|
+
## Examples
|
71
|
+
|
72
|
+
```shell script
|
73
|
+
$ twitterscraper --query twitter --limit 1000
|
74
|
+
$ cat tweets.json | jq . | less
|
75
|
+
```
|
76
|
+
|
77
|
+
|
64
78
|
## Attributes
|
65
79
|
|
66
80
|
### Tweet
|
@@ -72,14 +86,44 @@ tweets.each do |tweet|
|
|
72
86
|
puts tweet.tweet_url
|
73
87
|
puts tweet.created_at
|
74
88
|
|
89
|
+
attr_names = hash.keys
|
75
90
|
hash = tweet.attrs
|
76
|
-
|
91
|
+
json = tweet.to_json
|
77
92
|
end
|
78
93
|
```
|
79
94
|
|
95
|
+
```json
|
96
|
+
[
|
97
|
+
{
|
98
|
+
"screen_name": "@name",
|
99
|
+
"name": "Name",
|
100
|
+
"user_id": 12340000,
|
101
|
+
"profile_image_url": "https://pbs.twimg.com/profile_images/1826000000/0000.png",
|
102
|
+
"tweet_id": 1234000000000000,
|
103
|
+
"text": "Thanks Twitter!",
|
104
|
+
"links": [],
|
105
|
+
"hashtags": [],
|
106
|
+
"image_urls": [],
|
107
|
+
"video_url": null,
|
108
|
+
"has_media": null,
|
109
|
+
"likes": 10,
|
110
|
+
"retweets": 20,
|
111
|
+
"replies": 0,
|
112
|
+
"is_replied": false,
|
113
|
+
"is_reply_to": false,
|
114
|
+
"parent_tweet_id": null,
|
115
|
+
"reply_to_users": [],
|
116
|
+
"tweet_url": "https://twitter.com/name/status/1234000000000000",
|
117
|
+
"timestamp": 1594793000,
|
118
|
+
"created_at": "2020-07-15 00:00:00 +0000"
|
119
|
+
}
|
120
|
+
]
|
121
|
+
```
|
122
|
+
|
80
123
|
- screen_name
|
81
124
|
- name
|
82
125
|
- user_id
|
126
|
+
- profile_image_url
|
83
127
|
- tweet_id
|
84
128
|
- text
|
85
129
|
- links
|
@@ -118,45 +162,25 @@ end
|
|
118
162
|
Search operators documentation is in [Standard search operators](https://developer.twitter.com/en/docs/tweets/rules-and-filtering/overview/standard-operators).
|
119
163
|
|
120
164
|
|
121
|
-
## Examples
|
122
|
-
|
123
|
-
```shell script
|
124
|
-
$ twitterscraper --query twitter --limit 1000
|
125
|
-
$ cat tweets.json | jq . | less
|
126
|
-
```
|
127
|
-
|
128
|
-
```json
|
129
|
-
[
|
130
|
-
{
|
131
|
-
"screen_name": "@screenname",
|
132
|
-
"name": "name",
|
133
|
-
"user_id": 1194529546483000000,
|
134
|
-
"tweet_id": 1282659891992000000,
|
135
|
-
"tweet_url": "https://twitter.com/screenname/status/1282659891992000000",
|
136
|
-
"created_at": "2020-07-13 12:00:00 +0000",
|
137
|
-
"text": "Thanks Twitter!"
|
138
|
-
}
|
139
|
-
]
|
140
|
-
```
|
141
|
-
|
142
165
|
## CLI Options
|
143
166
|
|
144
|
-
| Option | Description |
|
145
|
-
| ------------- | ------------- | ------------- |
|
146
|
-
|
|
147
|
-
| `--type` | Specify a search type. | search |
|
148
|
-
| `--query` | Specify a keyword used during the search. | |
|
149
|
-
| `--start_date` | Used as "since:yyyy-mm-dd for your query. This means "since the date". | |
|
150
|
-
| `--end_date` | Used as "until:yyyy-mm-dd for your query. This means "before the date". | |
|
151
|
-
| `--lang` | Retrieve tweets written in a specific language. | |
|
152
|
-
| `--limit` | Stop scraping when *at least* the number of tweets indicated with --limit is scraped. | 100 |
|
153
|
-
| `--order` | Sort order of the results. | desc |
|
154
|
-
| `--threads` | Set the number of threads twitterscraper-ruby should initiate while scraping for your query. | 2 |
|
155
|
-
| `--
|
156
|
-
| `--
|
157
|
-
| `--
|
158
|
-
| `--
|
159
|
-
| `--
|
167
|
+
| Option | Type | Description | Value |
|
168
|
+
| ------------- | ------------- | ------------- | ------------- |
|
169
|
+
| `--help` | string | This option displays a summary of twitterscraper. | |
|
170
|
+
| `--type` | string | Specify a search type. | search(default) or user |
|
171
|
+
| `--query` | string | Specify a keyword used during the search. | |
|
172
|
+
| `--start_date` | string | Used as "since:yyyy-mm-dd" for your query. This means "since the date". | |
|
173
|
+
| `--end_date` | string | Used as "until:yyyy-mm-dd" for your query. This means "before the date". | |
|
174
|
+
| `--lang` | string | Retrieve tweets written in a specific language. | |
|
175
|
+
| `--limit` | integer | Stop scraping when *at least* the number of tweets indicated with --limit is scraped. | 100 |
|
176
|
+
| `--order` | string | Sort order of the results. | desc(default) or asc |
|
177
|
+
| `--threads` | integer | Set the number of threads twitterscraper-ruby should initiate while scraping for your query. | 2 |
|
178
|
+
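| `--threads_granularity` | string | Granularity used to split the date range between `--start_date` and `--end_date` into separate queries: one query per day or one per hour. With `auto`, a range of 28 days or more is split by day, otherwise by hour. | auto(default), day or hour |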
|
179
|
+
| `--proxy` | boolean | Scrape https://twitter.com/search via proxies. | true(default) or false |
|
180
|
+
| `--cache` | boolean | Enable caching. | true(default) or false |
|
181
|
+
| `--format` | string | The format of the output. | json(default) or html |
|
182
|
+
| `--output` | string | The name of the output file. | tweets.json |
|
183
|
+
| `--verbose` | | Print debug messages. | |
|
160
184
|
|
161
185
|
|
162
186
|
## Contributing
|
data/lib/twitterscraper.rb
CHANGED
data/lib/twitterscraper/cache.rb
CHANGED
@@ -4,7 +4,7 @@ require 'digest/md5'
|
|
4
4
|
module Twitterscraper
|
5
5
|
class Cache
|
6
6
|
def initialize()
|
7
|
-
@ttl =
|
7
|
+
@ttl = 86400 # 1 day
|
8
8
|
@dir = 'cache'
|
9
9
|
Dir.mkdir(@dir) unless File.exist?(@dir)
|
10
10
|
end
|
@@ -25,6 +25,12 @@ module Twitterscraper
|
|
25
25
|
File.write(file, entry.to_json)
|
26
26
|
end
|
27
27
|
|
28
|
+
def delete(key)
|
29
|
+
key = cache_key(key)
|
30
|
+
file = File.join(@dir, key)
|
31
|
+
File.delete(file) if File.exist?(file)
|
32
|
+
end
|
33
|
+
|
28
34
|
def fetch(key, &block)
|
29
35
|
if (value = read(key))
|
30
36
|
value
|
data/lib/twitterscraper/cli.rb
CHANGED
@@ -24,19 +24,22 @@ module Twitterscraper
|
|
24
24
|
daily_limit: options['daily_limit'],
|
25
25
|
order: options['order'],
|
26
26
|
threads: options['threads'],
|
27
|
+
threads_granularity: options['threads_granularity'],
|
27
28
|
}
|
28
29
|
client = Twitterscraper::Client.new(cache: options['cache'], proxy: options['proxy'])
|
29
30
|
tweets = client.query_tweets(options['query'], query_options)
|
30
|
-
export(tweets) unless tweets.empty?
|
31
|
+
export(options['query'], tweets) unless tweets.empty?
|
31
32
|
end
|
32
33
|
|
33
|
-
def export(tweets)
|
34
|
-
|
34
|
+
def export(name, tweets)
|
35
|
+
filepath = options['output']
|
36
|
+
Dir.mkdir(File.dirname(filepath)) unless File.exist?(File.dirname(filepath))
|
37
|
+
write_json = lambda { File.write(filepath, generate_json(tweets)) }
|
35
38
|
|
36
39
|
if options['format'] == 'json'
|
37
40
|
write_json.call
|
38
41
|
elsif options['format'] == 'html'
|
39
|
-
File.write(
|
42
|
+
File.write(filepath, Template.new.tweets_embedded_html(name, tweets, options))
|
40
43
|
else
|
41
44
|
write_json.call
|
42
45
|
end
|
@@ -69,6 +72,7 @@ module Twitterscraper
|
|
69
72
|
'daily_limit:',
|
70
73
|
'order:',
|
71
74
|
'threads:',
|
75
|
+
'threads_granularity:',
|
72
76
|
'output:',
|
73
77
|
'format:',
|
74
78
|
'cache:',
|
@@ -82,10 +86,11 @@ module Twitterscraper
|
|
82
86
|
options['lang'] ||= ''
|
83
87
|
options['limit'] = (options['limit'] || 100).to_i
|
84
88
|
options['daily_limit'] = options['daily_limit'].to_i if options['daily_limit']
|
85
|
-
options['threads'] = (options['threads'] ||
|
89
|
+
options['threads'] = (options['threads'] || 10).to_i
|
90
|
+
options['threads_granularity'] ||= 'auto'
|
86
91
|
options['format'] ||= 'json'
|
87
92
|
options['order'] ||= 'desc'
|
88
|
-
options['output'] ||=
|
93
|
+
options['output'] ||= build_output_name(options)
|
89
94
|
|
90
95
|
options['cache'] = options['cache'] != 'false'
|
91
96
|
options['proxy'] = options['proxy'] != 'false'
|
@@ -93,6 +98,12 @@ module Twitterscraper
|
|
93
98
|
options
|
94
99
|
end
|
95
100
|
|
101
|
+
def build_output_name(options)
|
102
|
+
query = options['query'].gsub(/[ :?#&]/, '_')
|
103
|
+
date = [options['start_date'], options['end_date']].select { |val| val && !val.empty? }.join('_')
|
104
|
+
File.join('out', [options['type'], 'tweets', date, query].compact.join('_') + '.' + options['format'])
|
105
|
+
end
|
106
|
+
|
96
107
|
def initialize_logger
|
97
108
|
Twitterscraper.logger.level = ::Logger::DEBUG if options['verbose']
|
98
109
|
end
|
data/lib/twitterscraper/query.rb
CHANGED
@@ -27,8 +27,8 @@ module Twitterscraper
|
|
27
27
|
'include_available_features=1&include_entities=1&' +
|
28
28
|
'max_position=__POS__&reset_error_state=false'
|
29
29
|
|
30
|
-
def build_query_url(query, lang,
|
31
|
-
if
|
30
|
+
def build_query_url(query, lang, type, pos)
|
31
|
+
if type.user?
|
32
32
|
if pos
|
33
33
|
RELOAD_URL_USER.sub('__USER__', query).sub('__POS__', pos.to_s)
|
34
34
|
else
|
@@ -51,7 +51,7 @@ module Twitterscraper
|
|
51
51
|
end
|
52
52
|
Http.get(url, headers, proxy, timeout)
|
53
53
|
rescue => e
|
54
|
-
logger.debug "
|
54
|
+
logger.debug "get_single_page: #{e.inspect}"
|
55
55
|
if (retries -= 1) > 0
|
56
56
|
logger.info "Retrying... (Attempts left: #{retries - 1})"
|
57
57
|
retry
|
@@ -69,7 +69,6 @@ module Twitterscraper
|
|
69
69
|
else
|
70
70
|
json_resp = JSON.parse(text)
|
71
71
|
items_html = json_resp['items_html'] || ''
|
72
|
-
logger.warn json_resp['message'] if json_resp['message'] # Sorry, you are rate limited.
|
73
72
|
end
|
74
73
|
|
75
74
|
[items_html, json_resp]
|
@@ -77,22 +76,26 @@ module Twitterscraper
|
|
77
76
|
|
78
77
|
def query_single_page(query, lang, type, pos, headers: [], proxies: [])
|
79
78
|
logger.info "Querying #{query}"
|
80
|
-
|
79
|
+
encoded_query = ERB::Util.url_encode(query)
|
81
80
|
|
82
|
-
url = build_query_url(
|
81
|
+
url = build_query_url(encoded_query, lang, type, pos)
|
83
82
|
http_request = lambda do
|
84
|
-
logger.debug "Scraping tweets from
|
83
|
+
logger.debug "Scraping tweets from url=#{url}"
|
85
84
|
get_single_page(url, headers, proxies)
|
86
85
|
end
|
87
86
|
|
88
87
|
if cache_enabled?
|
89
88
|
client = Cache.new
|
90
89
|
if (response = client.read(url))
|
91
|
-
logger.debug
|
90
|
+
logger.debug "Fetching tweets from cache url=#{url}"
|
92
91
|
else
|
93
92
|
response = http_request.call
|
94
93
|
client.write(url, response) unless stop_requested?
|
95
94
|
end
|
95
|
+
if @queries && query == @queries.last && pos.nil?
|
96
|
+
logger.debug "Delete a cache query=#{query}"
|
97
|
+
client.delete(url)
|
98
|
+
end
|
96
99
|
else
|
97
100
|
response = http_request.call
|
98
101
|
end
|
@@ -100,6 +103,12 @@ module Twitterscraper
|
|
100
103
|
|
101
104
|
html, json_resp = parse_single_page(response, pos.nil?)
|
102
105
|
|
106
|
+
if json_resp && json_resp['message']
|
107
|
+
logger.warn json_resp['message'] # Sorry, you are rate limited.
|
108
|
+
@stop_requested = true
|
109
|
+
Cache.new.delete(url) if cache_enabled?
|
110
|
+
end
|
111
|
+
|
103
112
|
tweets = Tweet.from_html(html)
|
104
113
|
|
105
114
|
if tweets.empty?
|
@@ -108,7 +117,7 @@ module Twitterscraper
|
|
108
117
|
|
109
118
|
if json_resp
|
110
119
|
[tweets, json_resp['min_position']]
|
111
|
-
elsif type
|
120
|
+
elsif type.user?
|
112
121
|
[tweets, tweets[-1].tweet_id]
|
113
122
|
else
|
114
123
|
[tweets, "TWEET-#{tweets[-1].tweet_id}-#{tweets[0].tweet_id}"]
|
@@ -140,19 +149,33 @@ module Twitterscraper
|
|
140
149
|
raise Error.new(":start_date must be greater than or equal to #{OLDEST_DATE}")
|
141
150
|
end
|
142
151
|
end
|
143
|
-
|
144
|
-
if end_date
|
145
|
-
today = Date.today
|
146
|
-
if end_date > Date.today
|
147
|
-
raise Error.new(":end_date must be less than or equal to today(#{today})")
|
148
|
-
end
|
149
|
-
end
|
150
152
|
end
|
151
153
|
|
152
|
-
def build_queries(query, start_date, end_date)
|
154
|
+
def build_queries(query, start_date, end_date, threads_granularity)
|
153
155
|
if start_date && end_date
|
154
|
-
|
155
|
-
|
156
|
+
if threads_granularity == 'auto'
|
157
|
+
threads_granularity = start_date.upto(end_date - 1).to_a.size >= 28 ? 'day' : 'hour'
|
158
|
+
end
|
159
|
+
|
160
|
+
if threads_granularity == 'day'
|
161
|
+
date_range = start_date.upto(end_date - 1)
|
162
|
+
queries = date_range.map { |date| query + " since:#{date} until:#{date + 1}" }
|
163
|
+
else
|
164
|
+
time = Time.utc(start_date.year, start_date.month, start_date.day, 0, 0, 0)
|
165
|
+
end_time = Time.utc(end_date.year, end_date.month, end_date.day, 0, 0, 0)
|
166
|
+
queries = []
|
167
|
+
|
168
|
+
while true
|
169
|
+
if time < Time.now.utc
|
170
|
+
queries << (query + " since:#{time.strftime('%Y-%m-%d_%H:00:00')}_UTC until:#{(time + 3600).strftime('%Y-%m-%d_%H:00:00')}_UTC")
|
171
|
+
end
|
172
|
+
time += 3600
|
173
|
+
break if time >= end_time
|
174
|
+
end
|
175
|
+
end
|
176
|
+
|
177
|
+
@queries = queries
|
178
|
+
|
156
179
|
elsif start_date
|
157
180
|
[query + " since:#{start_date}"]
|
158
181
|
elsif end_date
|
@@ -196,12 +219,18 @@ module Twitterscraper
|
|
196
219
|
@stop_requested
|
197
220
|
end
|
198
221
|
|
199
|
-
def query_tweets(query, type: 'search', start_date: nil, end_date: nil, lang: nil, limit: 100, daily_limit: nil, order: 'desc', threads:
|
200
|
-
|
201
|
-
|
202
|
-
|
222
|
+
def query_tweets(query, type: 'search', start_date: nil, end_date: nil, lang: nil, limit: 100, daily_limit: nil, order: 'desc', threads: 10, threads_granularity: 'auto')
|
223
|
+
type = Type.new(type)
|
224
|
+
if type.search?
|
225
|
+
start_date = Date.parse(start_date) if start_date && start_date.is_a?(String)
|
226
|
+
end_date = Date.parse(end_date) if end_date && end_date.is_a?(String)
|
227
|
+
elsif type.user?
|
228
|
+
start_date = nil
|
229
|
+
end_date = nil
|
230
|
+
end
|
231
|
+
|
232
|
+
queries = build_queries(query, start_date, end_date, threads_granularity)
|
203
233
|
if threads > queries.size
|
204
|
-
logger.warn 'The maximum number of :threads is the number of dates between :start_date and :end_date.'
|
205
234
|
threads = queries.size
|
206
235
|
end
|
207
236
|
if proxy_enabled?
|
@@ -213,9 +242,9 @@ module Twitterscraper
|
|
213
242
|
end
|
214
243
|
logger.debug "Cache #{cache_enabled? ? 'enabled' : 'disabled'}"
|
215
244
|
|
216
|
-
|
217
245
|
validate_options!(queries, type: type, start_date: start_date, end_date: end_date, lang: lang, limit: limit, threads: threads)
|
218
246
|
|
247
|
+
logger.info "The number of queries #{queries.size}"
|
219
248
|
logger.info "The number of threads #{threads}"
|
220
249
|
|
221
250
|
headers = {'User-Agent': USER_AGENT_LIST.sample, 'X-Requested-With': 'XMLHttpRequest'}
|
@@ -240,15 +269,17 @@ module Twitterscraper
|
|
240
269
|
end
|
241
270
|
end
|
242
271
|
|
272
|
+
logger.info "Return #{@all_tweets.size} tweets"
|
273
|
+
|
243
274
|
@all_tweets.sort_by { |tweet| (order == 'desc' ? -1 : 1) * tweet.created_at.to_i }
|
244
275
|
end
|
245
276
|
|
246
|
-
def search(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, order: 'desc', threads:
|
247
|
-
query_tweets(query, type: 'search', start_date: start_date, end_date: end_date, lang: lang, limit: limit, daily_limit: daily_limit, order: order, threads: threads)
|
277
|
+
def search(query, start_date: nil, end_date: nil, lang: '', limit: 100, daily_limit: nil, order: 'desc', threads: 10, threads_granularity: 'auto')
|
278
|
+
query_tweets(query, type: 'search', start_date: start_date, end_date: end_date, lang: lang, limit: limit, daily_limit: daily_limit, order: order, threads: threads, threads_granularity: threads_granularity)
|
248
279
|
end
|
249
280
|
|
250
281
|
def user_timeline(screen_name, limit: 100, order: 'desc')
|
251
|
-
query_tweets(screen_name, type: 'user', start_date: nil, end_date: nil, lang: nil, limit: limit, daily_limit: nil, order: order, threads: 1)
|
282
|
+
query_tweets(screen_name, type: 'user', start_date: nil, end_date: nil, lang: nil, limit: limit, daily_limit: nil, order: order, threads: 1, threads_granularity: nil)
|
252
283
|
end
|
253
284
|
end
|
254
285
|
end
|
@@ -1,48 +1,59 @@
|
|
1
1
|
module Twitterscraper
|
2
|
-
|
3
|
-
|
2
|
+
class Template
|
3
|
+
def tweets_embedded_html(name, tweets, options)
|
4
|
+
path = File.join(File.dirname(__FILE__), 'template/tweets.html.erb')
|
5
|
+
template = ERB.new(File.read(path))
|
4
6
|
|
5
|
-
|
6
|
-
|
7
|
-
|
7
|
+
tweets = tweets.sort_by { |t| t.created_at.to_i }
|
8
|
+
|
9
|
+
template.result_with_hash(
|
10
|
+
chart_name: name,
|
11
|
+
chart_data: chart_data(tweets).to_json,
|
12
|
+
first_tweet: tweets[0],
|
13
|
+
last_tweet: tweets[-1],
|
14
|
+
tweets: tweets,
|
15
|
+
convert_limit: 30,
|
16
|
+
)
|
8
17
|
end
|
9
18
|
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
19
|
+
def chart_data(tweets, trimming: true, smoothing: true)
|
20
|
+
min_interval = 5
|
21
|
+
|
22
|
+
data = tweets.each_with_object(Hash.new(0)) do |tweet, memo|
|
23
|
+
t = tweet.created_at
|
24
|
+
min = (t.min.to_f / min_interval).floor * min_interval
|
25
|
+
time = Time.new(t.year, t.month, t.day, t.hour, min, 0, '+00:00')
|
26
|
+
memo[time.to_i] += 1
|
27
|
+
end
|
28
|
+
|
29
|
+
if false && trimming
|
30
|
+
data.keys.sort.each.with_index do |timestamp, i|
|
31
|
+
break if data.size - 1 == i
|
32
|
+
if data[i] == 0 && data[i + 1] == 0
|
33
|
+
data.delete(timestamp)
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
if false && smoothing
|
39
|
+
time = data.keys.min
|
40
|
+
max_time = data.keys.max
|
41
|
+
sec_interval = 60 * min_interval
|
42
|
+
|
43
|
+
while true
|
44
|
+
next_time = time + sec_interval
|
45
|
+
break if next_time + sec_interval > max_time
|
46
|
+
|
47
|
+
unless data.has_key?(next_time)
|
48
|
+
data[next_time] = (data[time] + data[next_time + sec_interval]) / 2
|
49
|
+
end
|
50
|
+
time = next_time
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
data.sort_by { |k, _| k }.map do |timestamp, count|
|
55
|
+
[timestamp * 1000, count]
|
56
|
+
end
|
57
|
+
end
|
47
58
|
end
|
48
59
|
end
|
@@ -0,0 +1,109 @@
|
|
1
|
+
<html>
|
2
|
+
<head>
|
3
|
+
<script src="https://cdnjs.cloudflare.com/ajax/libs/moment.js/2.27.0/moment.min.js" integrity="sha512-rmZcZsyhe0/MAjquhTgiUcb4d9knaFc7b5xAfju483gbEXTkeJRUMIPk6s3ySZMYUHEcjKbjLjyddGWMrNEvZg==" crossorigin="anonymous"></script>
|
4
|
+
<script src="https://cdnjs.cloudflare.com/ajax/libs/moment-timezone/0.5.31/moment-timezone-with-data.min.js" integrity="sha512-HZcf3uHWA+Y2P5KNv+F/xa87/flKVP92kUTe/KXjU8URPshczF1Dx+cL5bw0VBGhmqWAK0UbhcqxBbyiNtAnWQ==" crossorigin="anonymous"></script>
|
5
|
+
<script src="https://code.highcharts.com/stock/highstock.js"></script>
|
6
|
+
<script>
|
7
|
+
function updateTweets() {
|
8
|
+
window.twttr = (function (d, s, id) {
|
9
|
+
var js, fjs = d.getElementsByTagName(s)[0], t = window.twttr || {};
|
10
|
+
if (d.getElementById(id)) return t;
|
11
|
+
js = d.createElement(s);
|
12
|
+
js.id = id;
|
13
|
+
js.src = "https://platform.twitter.com/widgets.js";
|
14
|
+
fjs.parentNode.insertBefore(js, fjs);
|
15
|
+
|
16
|
+
t._e = [];
|
17
|
+
t.ready = function (f) {
|
18
|
+
t._e.push(f);
|
19
|
+
};
|
20
|
+
|
21
|
+
return t;
|
22
|
+
}(document, "script", "twitter-wjs"));
|
23
|
+
}
|
24
|
+
|
25
|
+
function drawChart() {
|
26
|
+
Highcharts.setOptions({
|
27
|
+
time: {
|
28
|
+
timezone: moment.tz.guess()
|
29
|
+
}
|
30
|
+
});
|
31
|
+
|
32
|
+
var data = <%= chart_data %>;
|
33
|
+
var config = {
|
34
|
+
title: {
|
35
|
+
text: '<%= tweets.size %> tweets of <%= chart_name %>'
|
36
|
+
},
|
37
|
+
subtitle: {
|
38
|
+
text: 'since:<%= first_tweet.created_at.localtime.strftime('%Y-%m-%d %H:%M') %> until:<%= last_tweet.created_at.localtime.strftime('%Y-%m-%d %H:%M') %>'
|
39
|
+
},
|
40
|
+
series: [{
|
41
|
+
data: data
|
42
|
+
}],
|
43
|
+
rangeSelector: {enabled: false},
|
44
|
+
scrollbar: {enabled: false},
|
45
|
+
navigator: {enabled: false},
|
46
|
+
exporting: {enabled: false},
|
47
|
+
credits: {enabled: false}
|
48
|
+
};
|
49
|
+
|
50
|
+
Highcharts.stockChart('chart-container', config);
|
51
|
+
}
|
52
|
+
|
53
|
+
document.addEventListener("DOMContentLoaded", function () {
|
54
|
+
drawChart();
|
55
|
+
updateTweets();
|
56
|
+
});
|
57
|
+
</script>
|
58
|
+
|
59
|
+
<style type=text/css>
|
60
|
+
#chart-container {
|
61
|
+
max-width: 1200px;
|
62
|
+
height: 675px;
|
63
|
+
margin: 0 auto;
|
64
|
+
border: 1px solid rgb(204, 214, 221);
|
65
|
+
display: flex;
|
66
|
+
justify-content: center;
|
67
|
+
align-items: center;
|
68
|
+
}
|
69
|
+
.tweets-container {
|
70
|
+
max-width: 550px;
|
71
|
+
margin: 0 auto 0 auto;
|
72
|
+
}
|
73
|
+
|
74
|
+
.twitter-tweet {
|
75
|
+
margin: 15px 0 15px 0 !important;
|
76
|
+
}
|
77
|
+
</style>
|
78
|
+
</head>
|
79
|
+
<body>
|
80
|
+
<div id="chart-container"><div style="color: gray;">Loading...</div></div>
|
81
|
+
|
82
|
+
<div class="tweets-container">
|
83
|
+
<% tweets.sort_by { |t| -t.created_at.to_i }.each.with_index do |tweet, i| %>
|
84
|
+
<% tweet_time = tweet.created_at.localtime.strftime('%Y-%m-%d %H:%M') %>
|
85
|
+
<% if i < convert_limit %>
|
86
|
+
<blockquote class="twitter-tweet">
|
87
|
+
<% else %>
|
88
|
+
<div class="twitter-tweet" style="border: 1px solid rgb(204, 214, 221);">
|
89
|
+
<% end %>
|
90
|
+
|
91
|
+
<div style="display: grid; grid-template-rows: 24px 24px; grid-template-columns: 48px 1fr;">
|
92
|
+
<div style="grid-row: 1/3; grid-column: 1/2;"><img src="<%= tweet.profile_image_url %>" width="48" height="48" loading="lazy"></div>
|
93
|
+
<div style="grid-row: 1/2; grid-column: 2/3;"><%= tweet.name %></div>
|
94
|
+
<div style="grid-row: 2/3; grid-column: 2/3;"><a href="https://twitter.com/<%= tweet.screen_name %>">@<%= tweet.screen_name %></a></div>
|
95
|
+
</div>
|
96
|
+
|
97
|
+
<div><%= tweet.text %></div>
|
98
|
+
<div><a href="<%= tweet.tweet_url %>"><small><%= tweet_time %></small></a></div>
|
99
|
+
|
100
|
+
<% if i < convert_limit %>
|
101
|
+
</blockquote>
|
102
|
+
<% else %>
|
103
|
+
</div>
|
104
|
+
<% end %>
|
105
|
+
<% end %>
|
106
|
+
</div>
|
107
|
+
|
108
|
+
</body>
|
109
|
+
</html>
|
data/lib/twitterscraper/tweet.rb
CHANGED
@@ -6,6 +6,7 @@ module Twitterscraper
|
|
6
6
|
:screen_name,
|
7
7
|
:name,
|
8
8
|
:user_id,
|
9
|
+
:profile_image_url,
|
9
10
|
:tweet_id,
|
10
11
|
:text,
|
11
12
|
:links,
|
@@ -51,6 +52,11 @@ module Twitterscraper
|
|
51
52
|
end
|
52
53
|
end
|
53
54
|
|
55
|
+
# .js-stream-item
|
56
|
+
# .js-stream-tweet{data: {screen-name:, tweet-id:}}
|
57
|
+
# .stream-item-header
|
58
|
+
# .js-tweet-text-container
|
59
|
+
# .stream-item-footer
|
54
60
|
def from_html(text)
|
55
61
|
html = Nokogiri::HTML(text)
|
56
62
|
from_tweets_html(html.xpath("//li[@class[contains(., 'js-stream-item')]]/div[@class[contains(., 'js-stream-tweet')]]"))
|
@@ -72,6 +78,8 @@ module Twitterscraper
|
|
72
78
|
end
|
73
79
|
|
74
80
|
inner_html = Nokogiri::HTML(html.inner_html)
|
81
|
+
|
82
|
+
profile_image_url = inner_html.xpath("//img[@class[contains(., 'js-action-profile-avatar')]]").first.attr('src').gsub(/_bigger/, '')
|
75
83
|
text = inner_html.xpath("//div[@class[contains(., 'js-tweet-text-container')]]/p[@class[contains(., 'js-tweet-text')]]").first.text
|
76
84
|
links = inner_html.xpath("//a[@class[contains(., 'twitter-timeline-link')]]").map { |elem| elem.attr('data-expanded-url') }.select { |link| link && !link.include?('pic.twitter') }
|
77
85
|
image_urls = inner_html.xpath("//div[@class[contains(., 'AdaptiveMedia-photoContainer')]]").map { |elem| elem.attr('data-image-url') }
|
@@ -99,6 +107,7 @@ module Twitterscraper
|
|
99
107
|
screen_name: screen_name,
|
100
108
|
name: html.attr('data-name'),
|
101
109
|
user_id: html.attr('data-user-id').to_i,
|
110
|
+
profile_image_url: profile_image_url,
|
102
111
|
tweet_id: tweet_id,
|
103
112
|
text: text,
|
104
113
|
links: links,
|
data/lib/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: twitterscraper-ruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.18.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- ts-3156
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-07-
|
11
|
+
date: 2020-07-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -72,7 +72,9 @@ files:
|
|
72
72
|
- lib/twitterscraper/proxy.rb
|
73
73
|
- lib/twitterscraper/query.rb
|
74
74
|
- lib/twitterscraper/template.rb
|
75
|
+
- lib/twitterscraper/template/tweets.html.erb
|
75
76
|
- lib/twitterscraper/tweet.rb
|
77
|
+
- lib/twitterscraper/type.rb
|
76
78
|
- lib/version.rb
|
77
79
|
- twitterscraper-ruby.gemspec
|
78
80
|
homepage: https://github.com/ts-3156/twitterscraper-ruby
|